* [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
@ 2024-05-01 14:04 Jesper Dangaard Brouer
  2024-05-01 14:24 ` Waiman Long
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-01 14:04 UTC (permalink / raw)
  To: tj, hannes, lizefan.x, cgroups, yosryahmed, longman
  Cc: Jesper Dangaard Brouer, netdev, linux-mm, shakeel.butt,
	kernel-team, Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior

This closely resembles helpers added for the global cgroup_rstat_lock in
commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.

Based on production workloads, we observe the fast-path "update" function
cgroup_rstat_updated() is invoked around 3 million times per sec, while the
"flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
can see periodic spikes of 700 invocations/sec.

For this reason, the tracepoints are split into normal and fastpath
versions for this per-CPU lock, making it feasible for production to
continuously monitor the non-fastpath tracepoint to detect lock contention
issues. The reason for monitoring is that the lock disables IRQs, which
can disturb e.g. softirq processing on the local CPUs involved. When the
global cgroup_rstat_lock stops disabling IRQs (e.g. converted to a mutex),
this per-CPU lock becomes the next bottleneck that can introduce latency
variations.

A practical bpftrace script for monitoring contention latency:

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
     @start[tid]=nsecs; @cnt[probe]=count()}
   tracepoint:cgroup:cgroup_rstat_cpu_locked {
     if (args->contended) {
       @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
     @cnt[probe]=count()}
   interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
 include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
 kernel/cgroup/rstat.c         |   70 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 108 insertions(+), 18 deletions(-)

diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index 13f375800135..0b95865a90f3 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
 
 DECLARE_EVENT_CLASS(cgroup_rstat,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
 
-	TP_ARGS(cgrp, cpu_in_loop, contended),
+	TP_ARGS(cgrp, cpu, contended),
 
 	TP_STRUCT__entry(
 		__field(	int,		root			)
 		__field(	int,		level			)
 		__field(	u64,		id			)
-		__field(	int,		cpu_in_loop		)
+		__field(	int,		cpu			)
 		__field(	bool,		contended		)
 	),
 
@@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
 		__entry->root = cgrp->root->hierarchy_id;
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
-		__entry->cpu_in_loop = cpu_in_loop;
+		__entry->cpu = cpu;
 		__entry->contended = contended;
 	),
 
-	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
+	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
 		  __entry->root, __entry->id, __entry->level,
-		  __entry->cpu_in_loop, __entry->contended)
+		  __entry->cpu, __entry->contended)
 );
 
+/* Related to global: cgroup_rstat_lock */
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
 
 	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
@@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 	TP_ARGS(cgrp, cpu, contended)
 );
 
+/* Related to per CPU: cgroup_rstat_cpu_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 52e3b0ed1cee..fb8b49437573 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
 	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
 }
 
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determine the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+				     struct cgroup *cgrp, const bool fast_path)
+{
+	unsigned long flags;
+	bool contended;
+
+	/*
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
+	 */
+	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+	if (contended) {
+		if (fast_path)
+			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+		else
+			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+		raw_spin_lock_irqsave(cpu_lock, flags);
+	}
+
+	if (fast_path)
+		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+	else
+		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+	return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+			      struct cgroup *cgrp, unsigned long flags,
+			      const bool fast_path)
+{
+	if (fast_path)
+		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+	else
+		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
 /**
  * cgroup_rstat_updated - keep track of updated rstat_cpu
  * @cgrp: target cgroup
@@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
 		return;
 
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
 	/* put @cgrp and all ancestors on the corresponding updated lists */
 	while (true) {
@@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 		cgrp = parent;
 	}
 
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
 }
 
 /**
@@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	struct cgroup *head = NULL, *parent, *child;
 	unsigned long flags;
 
-	/*
-	 * The _irqsave() is needed because cgroup_rstat_lock is
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-	 * this lock with the _irq() suffix only disables interrupts on
-	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-	 * interrupts on both configurations. The _irqsave() ensures
-	 * that interrupts are always disabled and later restored.
-	 */
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
@@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	if (child != root)
 		head = cgroup_rstat_push_children(head, child, cpu);
 unlock_ret:
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
 	return head;
 }
 





* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-01 14:04 [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints Jesper Dangaard Brouer
@ 2024-05-01 14:24 ` Waiman Long
  2024-05-01 17:22   ` Jesper Dangaard Brouer
  2024-05-14  5:18 ` Jesper Dangaard Brouer
  2024-05-15 16:58 ` Tejun Heo
  2 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2024-05-01 14:24 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, tj, hannes, lizefan.x, cgroups,
	yosryahmed
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior

On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
> This closely resembles helpers added for the global cgroup_rstat_lock in
> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>
> Based on production workloads, we observe the fast-path "update" function
> cgroup_rstat_updated() is invoked around 3 million times per sec, while the
> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
> can see periodic spikes of 700 invocations/sec.
>
> For this reason, the tracepoints are split into normal and fastpath
> versions for this per-CPU lock. Making it feasible for production to
> continuously monitor the non-fastpath tracepoint to detect lock contention
> issues. The reason for monitoring is that lock disables IRQs which can
> disturb e.g. softirq processing on the local CPUs involved. When the
> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
> this per CPU lock becomes the next bottleneck that can introduce latency
> variations.
>
> A practical bpftrace script for monitoring contention latency:
>
>   bpftrace -e '
>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>       @start[tid]=nsecs; @cnt[probe]=count()}
>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>       if (args->contended) {
>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>       @cnt[probe]=count()}
>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'

This is a per-cpu lock, so the only possible contention involves just 2
CPUs: a local CPU invoking cgroup_rstat_updated(), and a flusher CPU
doing cgroup_rstat_flush_locked() calling into
cgroup_rstat_updated_list(). With recent commits to reduce the percpu
lock hold time, I doubt lock contention on the percpu lock will have a
great impact on latency. So do we really need such an elaborate scheme
to monitor this? BTW, the additional code will also add to the
worst-case latency.

Cheers,
Longman

>
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> ---
>   include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
>   kernel/cgroup/rstat.c         |   70 ++++++++++++++++++++++++++++++++++-------
>   2 files changed, 108 insertions(+), 18 deletions(-)
>
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index 13f375800135..0b95865a90f3 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>   
>   DECLARE_EVENT_CLASS(cgroup_rstat,
>   
> -	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>   
> -	TP_ARGS(cgrp, cpu_in_loop, contended),
> +	TP_ARGS(cgrp, cpu, contended),
>   
>   	TP_STRUCT__entry(
>   		__field(	int,		root			)
>   		__field(	int,		level			)
>   		__field(	u64,		id			)
> -		__field(	int,		cpu_in_loop		)
> +		__field(	int,		cpu			)
>   		__field(	bool,		contended		)
>   	),
>   
> @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>   		__entry->root = cgrp->root->hierarchy_id;
>   		__entry->id = cgroup_id(cgrp);
>   		__entry->level = cgrp->level;
> -		__entry->cpu_in_loop = cpu_in_loop;
> +		__entry->cpu = cpu;
>   		__entry->contended = contended;
>   	),
>   
> -	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
> +	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
>   		  __entry->root, __entry->id, __entry->level,
> -		  __entry->cpu_in_loop, __entry->contended)
> +		  __entry->cpu, __entry->contended)
>   );
>   
> +/* Related to global: cgroup_rstat_lock */
>   DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>   
>   	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>   	TP_ARGS(cgrp, cpu, contended)
>   );
>   
> +/* Related to per CPU: cgroup_rstat_cpu_lock */
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
>   #endif /* _TRACE_CGROUP_H */
>   
>   /* This part must be outside protection */
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 52e3b0ed1cee..fb8b49437573 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>   	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>   }
>   
> +/*
> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + *
> + * This makes it easier to diagnose locking issues and contention in
> + * production environments. The parameter @fast_path determine the
> + * tracepoints being added, allowing us to diagnose "flush" related
> + * operations without handling high-frequency fast-path "update" events.
> + */
> +static __always_inline
> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
> +				     struct cgroup *cgrp, const bool fast_path)
> +{
> +	unsigned long flags;
> +	bool contended;
> +
> +	/*
> +	 * The _irqsave() is needed because cgroup_rstat_lock is
> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> +	 * this lock with the _irq() suffix only disables interrupts on
> +	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> +	 * interrupts on both configurations. The _irqsave() ensures
> +	 * that interrupts are always disabled and later restored.
> +	 */
> +	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
> +	if (contended) {
> +		if (fast_path)
> +			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
> +		else
> +			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
> +
> +		raw_spin_lock_irqsave(cpu_lock, flags);
> +	}
> +
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
> +	else
> +		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
> +
> +	return flags;
> +}
> +
> +static __always_inline
> +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
> +			      struct cgroup *cgrp, unsigned long flags,
> +			      const bool fast_path)
> +{
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
> +	else
> +		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
> +
> +	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +}
> +
>   /**
>    * cgroup_rstat_updated - keep track of updated rstat_cpu
>    * @cgrp: target cgroup
> @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
>   		return;
>   
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>   
>   	/* put @cgrp and all ancestors on the corresponding updated lists */
>   	while (true) {
> @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   		cgrp = parent;
>   	}
>   
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
>   }
>   
>   /**
> @@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	struct cgroup *head = NULL, *parent, *child;
>   	unsigned long flags;
>   
> -	/*
> -	 * The _irqsave() is needed because cgroup_rstat_lock is
> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> -	 * this lock with the _irq() suffix only disables interrupts on
> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> -	 * interrupts on both configurations. The _irqsave() ensures
> -	 * that interrupts are always disabled and later restored.
> -	 */
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
>   
>   	/* Return NULL if this subtree is not on-list */
>   	if (!rstatc->updated_next)
> @@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	if (child != root)
>   		head = cgroup_rstat_push_children(head, child, cpu);
>   unlock_ret:
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
>   	return head;
>   }
>   
>
>




* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-01 14:24 ` Waiman Long
@ 2024-05-01 17:22   ` Jesper Dangaard Brouer
  2024-05-01 18:41     ` Waiman Long
  2024-05-02 19:44     ` Shakeel Butt
  0 siblings, 2 replies; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-01 17:22 UTC (permalink / raw)
  To: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior



On 01/05/2024 16.24, Waiman Long wrote:
> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>> This closely resembles helpers added for the global cgroup_rstat_lock in
>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>
>> Based on production workloads, we observe the fast-path "update" function
>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>> while the
>> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
>> can see periodic spikes of 700 invocations/sec.
>>
>> For this reason, the tracepoints are split into normal and fastpath
>> versions for this per-CPU lock. Making it feasible for production to
>> continuously monitor the non-fastpath tracepoint to detect lock 
>> contention
>> issues. The reason for monitoring is that lock disables IRQs which can
>> disturb e.g. softirq processing on the local CPUs involved. When the
>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
>> this per CPU lock becomes the next bottleneck that can introduce latency
>> variations.
>>
>> A practical bpftrace script for monitoring contention latency:
>>
>>   bpftrace -e '
>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>       if (args->contended) {
>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>       @cnt[probe]=count()}
>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>> clear(@cnt);}'
> 
> This is a per-cpu lock. So the only possible contention involves only 2 
> CPUs - a local CPU invoking cgroup_rstat_updated(). A flusher CPU doing 
> cgroup_rstat_flush_locked() calling into cgroup_rstat_updated_list(). 
> With recent commits to reduce the percpu lock hold time, I doubt lock 
> contention on the percpu lock will have a great impact on latency. 

I do appreciate your recent changes to reduce the percpu lock hold time.
These tracepoints allow me to measure and differentiate the percpu lock
hold time vs. the flush time.

In production (using [1]) I'm seeing "Long lock-hold time" [L100] of
e.g. up to 29 ms, which is the time spent after obtaining the lock
(runtime under the lock).  I was expecting to see "High Lock-contention
wait" [L82], which is the time spent waiting to obtain the lock.

This is why I'm adding these tracepoints, as they allow me to dig
deeper, to understand where these high runtime variations originate from.
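
E.g. the hold time itself can be histogrammed with a small bpftrace
sketch like this (untested, pairing the non-fastpath locked/unlock
tracepoints added by this patch):

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_cpu_locked {
     @locked[tid]=nsecs; }
   tracepoint:cgroup:cgroup_rstat_cpu_unlock {
     if (@locked[tid]) {
       @hold_ns=hist(nsecs-@locked[tid]); delete(@locked[tid]);}}
   interval:s:1 {time("%H:%M:%S "); print(@hold_ns);}'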


Data:

  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 comm:kswapd4
  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 comm:kswapd4
  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 comm:kswapd4
  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 comm:kswapd6
 16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 comm:kworker/u261:12
  16:52:09  time elapsed: 80 sec (interval = 1 sec)
   Flushes(5033) 294/interval (avg 62/sec)
   Locks(53374) 1748/interval (avg 667/sec)
   Yields(48341) 1454/interval (avg 604/sec)
   Contended(48104) 1450/interval (avg 601/sec)


> So do 
> we really need such an elaborate scheme to monitor this? BTW, the 
> additional code will also add to the worst case latency.

Hmm, I designed this code to have minimal impact, as tracepoints are
no-ops until activated.  I really doubt this code will change the latency.


[1] 
https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt

[L100] 
https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100

[L82] 
https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82

>>
>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>

More data: the histogram of time spent under the lock shows some
strange variation, with a group in the 4 ms to 65 ms area. I'm
investigating what can be causing this... the next step depends on
these tracepoints.

@lock_cnt: 759146

@locked_ns:
[1K, 2K)             499 |                                                    |
[2K, 4K)          206928 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@               |
[8K, 16K)          64453 |@@@@@@@@@@@@@@@@                                    |
[16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                  |
[32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@                                 |
[64K, 128K)        38359 |@@@@@@@@@                                           |
[128K, 256K)       46597 |@@@@@@@@@@@                                         |
[256K, 512K)       32466 |@@@@@@@@                                            |
[512K, 1M)          3945 |                                                    |
[1M, 2M)             642 |                                                    |
[2M, 4M)             750 |                                                    |
[4M, 8M)            1932 |                                                    |
[8M, 16M)           2114 |                                                    |
[16M, 32M)          1039 |                                                    |
[32M, 64M)           108 |                                                    |




>> ---
>>   include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
>>   kernel/cgroup/rstat.c         |   70 
>> ++++++++++++++++++++++++++++++++++-------
>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/trace/events/cgroup.h 
>> b/include/trace/events/cgroup.h
>> index 13f375800135..0b95865a90f3 100644
>> --- a/include/trace/events/cgroup.h
>> +++ b/include/trace/events/cgroup.h
>> @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>>   DECLARE_EVENT_CLASS(cgroup_rstat,
>> -    TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> -    TP_ARGS(cgrp, cpu_in_loop, contended),
>> +    TP_ARGS(cgrp, cpu, contended),
>>       TP_STRUCT__entry(
>>           __field(    int,        root            )
>>           __field(    int,        level            )
>>           __field(    u64,        id            )
>> -        __field(    int,        cpu_in_loop        )
>> +        __field(    int,        cpu            )
>>           __field(    bool,        contended        )
>>       ),
>> @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>>           __entry->root = cgrp->root->hierarchy_id;
>>           __entry->id = cgroup_id(cgrp);
>>           __entry->level = cgrp->level;
>> -        __entry->cpu_in_loop = cpu_in_loop;
>> +        __entry->cpu = cpu;
>>           __entry->contended = contended;
>>       ),
>> -    TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock 
>> contended:%d",
>> +    TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
>>             __entry->root, __entry->id, __entry->level,
>> -          __entry->cpu_in_loop, __entry->contended)
>> +          __entry->cpu, __entry->contended)
>>   );
>> +/* Related to global: cgroup_rstat_lock */
>>   DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>>       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>>       TP_ARGS(cgrp, cpu, contended)
>>   );
>> +/* Related to per CPU: cgroup_rstat_cpu_lock */
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>>   #endif /* _TRACE_CGROUP_H */
>>   /* This part must be outside protection */
>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>> index 52e3b0ed1cee..fb8b49437573 100644
>> --- a/kernel/cgroup/rstat.c
>> +++ b/kernel/cgroup/rstat.c
>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>   }
>> +/*
>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>> + *
>> + * This makes it easier to diagnose locking issues and contention in
>> + * production environments. The parameter @fast_path determine the
>> + * tracepoints being added, allowing us to diagnose "flush" related
>> + * operations without handling high-frequency fast-path "update" events.
>> + */
>> +static __always_inline
>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
>> +                     struct cgroup *cgrp, const bool fast_path)
>> +{
>> +    unsigned long flags;
>> +    bool contended;
>> +
>> +    /*
>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>> +     * this lock with the _irq() suffix only disables interrupts on
>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>> +     * interrupts on both configurations. The _irqsave() ensures
>> +     * that interrupts are always disabled and later restored.
>> +     */
>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>> +    if (contended) {
>> +        if (fast_path)
>> +            trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, 
>> contended);
>> +        else
>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
>> +
>> +        raw_spin_lock_irqsave(cpu_lock, flags);
>> +    }
>> +
>> +    if (fast_path)
>> +        trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
>> +    else
>> +        trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
>> +
>> +    return flags;
>> +}
>> +
>> +static __always_inline
>> +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
>> +                  struct cgroup *cgrp, unsigned long flags,
>> +                  const bool fast_path)
>> +{
>> +    if (fast_path)
>> +        trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
>> +    else
>> +        trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
>> +
>> +    raw_spin_unlock_irqrestore(cpu_lock, flags);
>> +}
>> +
>>   /**
>>    * cgroup_rstat_updated - keep track of updated rstat_cpu
>>    * @cgrp: target cgroup
>> @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup 
>> *cgrp, int cpu)
>>       if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
>>           return;
>> -    raw_spin_lock_irqsave(cpu_lock, flags);
>> +    flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>>       /* put @cgrp and all ancestors on the corresponding updated 
>> lists */
>>       while (true) {
>> @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup 
>> *cgrp, int cpu)
>>           cgrp = parent;
>>       }
>> -    raw_spin_unlock_irqrestore(cpu_lock, flags);
>> +    _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
>>   }
>>   /**
>> @@ -153,15 +207,7 @@ static struct cgroup 
>> *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>>       struct cgroup *head = NULL, *parent, *child;
>>       unsigned long flags;
>> -    /*
>> -     * The _irqsave() is needed because cgroup_rstat_lock is
>> -     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>> -     * this lock with the _irq() suffix only disables interrupts on
>> -     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>> -     * interrupts on both configurations. The _irqsave() ensures
>> -     * that interrupts are always disabled and later restored.
>> -     */
>> -    raw_spin_lock_irqsave(cpu_lock, flags);
>> +    flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
>>       /* Return NULL if this subtree is not on-list */
>>       if (!rstatc->updated_next)
>> @@ -198,7 +244,7 @@ static struct cgroup 
>> *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>>       if (child != root)
>>           head = cgroup_rstat_push_children(head, child, cpu);
>>   unlock_ret:
>> -    raw_spin_unlock_irqrestore(cpu_lock, flags);
>> +    _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
>>       return head;
>>   }
>>
>>
> 



* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-01 17:22   ` Jesper Dangaard Brouer
@ 2024-05-01 18:41     ` Waiman Long
  2024-05-02 11:23       ` Jesper Dangaard Brouer
  2024-05-02 19:44     ` Shakeel Butt
  1 sibling, 1 reply; 21+ messages in thread
From: Waiman Long @ 2024-05-01 18:41 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, tj, hannes, lizefan.x, cgroups,
	yosryahmed
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior


On 5/1/24 13:22, Jesper Dangaard Brouer wrote:
>
>
> On 01/05/2024 16.24, Waiman Long wrote:
>> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>>> This closely resembles helpers added for the global 
>>> cgroup_rstat_lock in
>>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>>
>>> Based on production workloads, we observe the fast-path "update" 
>>> function
>>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>>> while the
>>> "flush" function cgroup_rstat_flush_locked(), walking each possible 
>>> CPU,
>>> can see periodic spikes of 700 invocations/sec.
>>>
>>> For this reason, the tracepoints are split into normal and fastpath
>>> versions for this per-CPU lock. Making it feasible for production to
>>> continuously monitor the non-fastpath tracepoint to detect lock 
>>> contention
>>> issues. The reason for monitoring is that lock disables IRQs which can
>>> disturb e.g. softirq processing on the local CPUs involved. When the
>>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a 
>>> mutex),
>>> this per CPU lock becomes the next bottleneck that can introduce 
>>> latency
>>> variations.
>>>
>>> A practical bpftrace script for monitoring contention latency:
>>>
>>>   bpftrace -e '
>>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>>       if (args->contended) {
>>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>>       @cnt[probe]=count()}
>>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>>> clear(@cnt);}'
>>
>> This is a per-cpu lock. So the only possible contention involves only 
>> 2 CPUs - a local CPU invoking cgroup_rstat_updated(). A flusher CPU 
>> doing cgroup_rstat_flush_locked() calling into 
>> cgroup_rstat_updated_list(). With recent commits to reduce the percpu 
>> lock hold time, I doubt lock contention on the percpu lock will have 
>> a great impact on latency. 
>
> I do appriciate your recent changes to reduce the percpu lock hold time.
> These tracepoints allow me to measure and differentiate the percpu lock
> hold time vs. the flush time.
>
> In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
> upto 29 ms, which is time spend after obtaining the lock (runtime under
> lock).  I was expecting to see "High Lock-contention wait" [L82] which
> is the time waiting for obtaining the lock.
>
> This is why I'm adding these tracepoints, as they allow me to digg
> deeper, to understand where this high runtime variations originate from.
>
>
> Data:
>
>  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 comm:kswapd4
>  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 comm:kswapd4
>  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 comm:kswapd4
>  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 comm:kswapd6
>  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
> comm:kworker/u261:12
That lock hold time is much higher than I would have expected.
>  16:52:09 time elapsed: 80 sec (interval = 1 sec)
>   Flushes(5033) 294/interval (avg 62/sec)
>   Locks(53374) 1748/interval (avg 667/sec)
>   Yields(48341) 1454/interval (avg 604/sec)
>   Contended(48104) 1450/interval (avg 601/sec)
>
>
>> So do we really need such an elaborate scheme to monitor this? BTW, 
>> the additional code will also add to the worst case latency.
>
> Hmm, I designed this code to have minimal impact, as tracepoints are
> no-ops until activated.  I really doubt this code will change the 
> latency.
>
>
> [1] 
> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
>
> [L100] 
> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100
>
> [L82] 
> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82
>
>>>
>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>
> More data, the histogram of time spend under the lock have some strange
> variation issues with a group in 4ms to 65ms area. Investigating what
> can be causeing this... which next step depend in these tracepoints.
>
> @lock_cnt: 759146
>
> @locked_ns:
> [1K, 2K)             499 |      |
> [2K, 4K)          206928 
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
> [64K, 128K)        38359 |@@@@@@@@@      |
> [128K, 256K)       46597 |@@@@@@@@@@@      |
> [256K, 512K)       32466 |@@@@@@@@      |
> [512K, 1M)          3945 |      |
> [1M, 2M)             642 |      |
> [2M, 4M)             750 |      |
> [4M, 8M)            1932 |      |
> [8M, 16M)           2114 |      |
> [16M, 32M)          1039 |      |
> [32M, 64M)           108 |      |
>
>
>
>
>>> ---
>>>   include/trace/events/cgroup.h |   56 
>>> +++++++++++++++++++++++++++++----
>>>   kernel/cgroup/rstat.c         |   70 
>>> ++++++++++++++++++++++++++++++++++-------
>>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/include/trace/events/cgroup.h 
>>> b/include/trace/events/cgroup.h
>>> index 13f375800135..0b95865a90f3 100644
>>> --- a/include/trace/events/cgroup.h
>>> +++ b/include/trace/events/cgroup.h
>>> @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>>>   DECLARE_EVENT_CLASS(cgroup_rstat,
>>> -    TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> -    TP_ARGS(cgrp, cpu_in_loop, contended),
>>> +    TP_ARGS(cgrp, cpu, contended),
>>>       TP_STRUCT__entry(
>>>           __field(    int,        root            )
>>>           __field(    int,        level            )
>>>           __field(    u64,        id            )
>>> -        __field(    int,        cpu_in_loop        )
>>> +        __field(    int,        cpu            )
>>>           __field(    bool,        contended        )
>>>       ),
>>> @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>>>           __entry->root = cgrp->root->hierarchy_id;
>>>           __entry->id = cgroup_id(cgrp);
>>>           __entry->level = cgrp->level;
>>> -        __entry->cpu_in_loop = cpu_in_loop;
>>> +        __entry->cpu = cpu;
>>>           __entry->contended = contended;
>>>       ),
>>> -    TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock 
>>> contended:%d",
>>> +    TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
>>>             __entry->root, __entry->id, __entry->level,
>>> -          __entry->cpu_in_loop, __entry->contended)
>>> +          __entry->cpu, __entry->contended)
>>>   );
>>> +/* Related to global: cgroup_rstat_lock */
>>>   DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>>>       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>>>       TP_ARGS(cgrp, cpu, contended)
>>>   );
>>> +/* Related to per CPU: cgroup_rstat_cpu_lock */
>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
>>> +
>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> +
>>> +    TP_ARGS(cgrp, cpu, contended)
>>> +);
>>> +
>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
>>> +
>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> +
>>> +    TP_ARGS(cgrp, cpu, contended)
>>> +);
>>> +
>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
>>> +
>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> +
>>> +    TP_ARGS(cgrp, cpu, contended)
>>> +);
>>> +
>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
>>> +
>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> +
>>> +    TP_ARGS(cgrp, cpu, contended)
>>> +);
>>> +
>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
>>> +
>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> +
>>> +    TP_ARGS(cgrp, cpu, contended)
>>> +);
>>> +
>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>>> +
>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>> +
>>> +    TP_ARGS(cgrp, cpu, contended)
>>> +);
>>> +
>>>   #endif /* _TRACE_CGROUP_H */
>>>   /* This part must be outside protection */
>>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>>> index 52e3b0ed1cee..fb8b49437573 100644
>>> --- a/kernel/cgroup/rstat.c
>>> +++ b/kernel/cgroup/rstat.c
>>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>>   }
>>> +/*
>>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>>> + *
>>> + * This makes it easier to diagnose locking issues and contention in
>>> + * production environments. The parameter @fast_path determine the
>>> + * tracepoints being added, allowing us to diagnose "flush" related
>>> + * operations without handling high-frequency fast-path "update" 
>>> events.
>>> + */
>>> +static __always_inline
>>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int 
>>> cpu,
>>> +                     struct cgroup *cgrp, const bool fast_path)
>>> +{
>>> +    unsigned long flags;
>>> +    bool contended;
>>> +
>>> +    /*
>>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>> +     * this lock with the _irq() suffix only disables interrupts on
>>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>> +     * interrupts on both configurations. The _irqsave() ensures
>>> +     * that interrupts are always disabled and later restored.
>>> +     */
>>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>> +    if (contended) {
>>> +        if (fast_path)
>>> + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
>>> +        else
>>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, 
>>> contended);
>>> +
>>> +        raw_spin_lock_irqsave(cpu_lock, flags);

Could you do a local_irq_save() before calling trace_cgroup*() and 
raw_spin_lock()? Would that help in eliminating this high lock hold time?

You can also do a local_irq_save() first before the trylock. That will 
eliminate the duplicated irq_restore() and irq_save() when there is 
contention.
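
Something like this rough (untested) sketch, just to illustrate the
ordering I mean:

	local_irq_save(flags);
	contended = !raw_spin_trylock(cpu_lock);
	if (contended) {
		if (fast_path)
			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
		else
			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);

		/* IRQs stay disabled while spinning, no restore/save round trip */
		raw_spin_lock(cpu_lock);
	}
	/* unlock side: raw_spin_unlock() + local_irq_restore(flags) */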

If not, there may be NMIs mixed in.

Thanks,
Longman

>>> +    }
>>> +
>>> +    if (fast_path)
>>> +        trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
>>> +    else
>>> +        trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
>>> +
>>> +    return flags;
>>> +}
>>> +
>>> +static __always_inline
>>> +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
>>> +                  struct cgroup *cgrp, unsigned long flags,
>>> +                  const bool fast_path)
>>> +{
>>> +    if (fast_path)
>>> +        trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
>>> +    else
>>> +        trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
>>> +
>>> +    raw_spin_unlock_irqrestore(cpu_lock, flags);
>>> +}
>>> +
>>>   /**
>>>    * cgroup_rstat_updated - keep track of updated rstat_cpu
>>>    * @cgrp: target cgroup
>>> @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct 
>>> cgroup *cgrp, int cpu)
>>>       if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
>>>           return;
>>> -    raw_spin_lock_irqsave(cpu_lock, flags);
>>> +    flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>>>       /* put @cgrp and all ancestors on the corresponding updated 
>>> lists */
>>>       while (true) {
>>> @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct 
>>> cgroup *cgrp, int cpu)
>>>           cgrp = parent;
>>>       }
>>> -    raw_spin_unlock_irqrestore(cpu_lock, flags);
>>> +    _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
>>>   }
>>>   /**
>>> @@ -153,15 +207,7 @@ static struct cgroup 
>>> *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>>>       struct cgroup *head = NULL, *parent, *child;
>>>       unsigned long flags;
>>> -    /*
>>> -     * The _irqsave() is needed because cgroup_rstat_lock is
>>> -     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>> -     * this lock with the _irq() suffix only disables interrupts on
>>> -     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>> -     * interrupts on both configurations. The _irqsave() ensures
>>> -     * that interrupts are always disabled and later restored.
>>> -     */
>>> -    raw_spin_lock_irqsave(cpu_lock, flags);
>>> +    flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
>>>       /* Return NULL if this subtree is not on-list */
>>>       if (!rstatc->updated_next)
>>> @@ -198,7 +244,7 @@ static struct cgroup 
>>> *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>>>       if (child != root)
>>>           head = cgroup_rstat_push_children(head, child, cpu); 
>>>   unlock_ret:
>>> -    raw_spin_unlock_irqrestore(cpu_lock, flags);
>>> +    _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
>>>       return head;
>>>   }
>>>
>>>
>>
>



* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-01 18:41     ` Waiman Long
@ 2024-05-02 11:23       ` Jesper Dangaard Brouer
  2024-05-02 18:19         ` Waiman Long
  0 siblings, 1 reply; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-02 11:23 UTC (permalink / raw)
  To: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior



On 01/05/2024 20.41, Waiman Long wrote:
> On 5/1/24 13:22, Jesper Dangaard Brouer wrote:
>>
>>
>> On 01/05/2024 16.24, Waiman Long wrote:
>>> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>>>> This closely resembles helpers added for the global 
>>>> cgroup_rstat_lock in
>>>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>>>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>>>
>>>> Based on production workloads, we observe the fast-path "update" 
>>>> function
>>>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>>>> while the
>>>> "flush" function cgroup_rstat_flush_locked(), walking each possible 
>>>> CPU,
>>>> can see periodic spikes of 700 invocations/sec.
>>>>
>>>> For this reason, the tracepoints are split into normal and fastpath
>>>> versions for this per-CPU lock. Making it feasible for production to
>>>> continuously monitor the non-fastpath tracepoint to detect lock 
>>>> contention
>>>> issues. The reason for monitoring is that lock disables IRQs which can
>>>> disturb e.g. softirq processing on the local CPUs involved. When the
>>>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a 
>>>> mutex),
>>>> this per CPU lock becomes the next bottleneck that can introduce 
>>>> latency
>>>> variations.
>>>>
>>>> A practical bpftrace script for monitoring contention latency:
>>>>
>>>>   bpftrace -e '
>>>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>>>       if (args->contended) {
>>>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>>>       @cnt[probe]=count()}
>>>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>>>> clear(@cnt);}'
>>>
>>> This is a per-cpu lock. So the only possible contention involves only 
>>> 2 CPUs - a local CPU invoking cgroup_rstat_updated(). A flusher CPU 
>>> doing cgroup_rstat_flush_locked() calling into 
>>> cgroup_rstat_updated_list(). With recent commits to reduce the percpu 
>>> lock hold time, I doubt lock contention on the percpu lock will have 
>>> a great impact on latency. 
>>
>> I do appriciate your recent changes to reduce the percpu lock hold time.
>> These tracepoints allow me to measure and differentiate the percpu lock
>> hold time vs. the flush time.
>>
>> In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
>> upto 29 ms, which is time spend after obtaining the lock (runtime under
>> lock).  I was expecting to see "High Lock-contention wait" [L82] which
>> is the time waiting for obtaining the lock.
>>
>> This is why I'm adding these tracepoints, as they allow me to digg
>> deeper, to understand where this high runtime variations originate from.
>>
>>
>> Data:
>>
>>  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 comm:kswapd4
>>  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 comm:kswapd4
>>  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 comm:kswapd4
>>  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 comm:kswapd6
>>  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
>> comm:kworker/u261:12
> That lock hold time is much higher than I would have expected.
>>  16:52:09 time elapsed: 80 sec (interval = 1 sec)
>>   Flushes(5033) 294/interval (avg 62/sec)
>>   Locks(53374) 1748/interval (avg 667/sec)
>>   Yields(48341) 1454/interval (avg 604/sec)
>>   Contended(48104) 1450/interval (avg 601/sec)
>>
>>
>>> So do we really need such an elaborate scheme to monitor this? BTW, 
>>> the additional code will also add to the worst case latency.
>>
>> Hmm, I designed this code to have minimal impact, as tracepoints are
>> no-ops until activated.  I really doubt this code will change the 
>> latency.
>>
>>
>> [1] 
>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
>>
>> [L100] 
>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100
>>
>> [L82] 
>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82
>>
>>>>
>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>
>> More data, the histogram of time spend under the lock have some strange
>> variation issues with a group in 4ms to 65ms area. Investigating what
>> can be causeing this... which next step depend in these tracepoints.
>>
>> @lock_cnt: 759146
>>
>> @locked_ns:
>> [1K, 2K)             499 |      |
>> [2K, 4K)          206928 
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>> [64K, 128K)        38359 |@@@@@@@@@      |
>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>> [256K, 512K)       32466 |@@@@@@@@      |
>> [512K, 1M)          3945 |      |
>> [1M, 2M)             642 |      |
>> [2M, 4M)             750 |      |
>> [4M, 8M)            1932 |      |
>> [8M, 16M)           2114 |      |
>> [16M, 32M)          1039 |      |
>> [32M, 64M)           108 |      |
>>
>>
>>
>>
>>>> ---
>>>>   include/trace/events/cgroup.h |   56 
>>>> +++++++++++++++++++++++++++++----
>>>>   kernel/cgroup/rstat.c         |   70 
>>>> ++++++++++++++++++++++++++++++++++-------
>>>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/include/trace/events/cgroup.h 
>>>> b/include/trace/events/cgroup.h
>>>> index 13f375800135..0b95865a90f3 100644
>>>> --- a/include/trace/events/cgroup.h
[...]
>>>> +++ b/include/trace/events/cgroup.h
[...]
>>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>>>> +
>>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>>> +
>>>> +    TP_ARGS(cgrp, cpu, contended)
>>>> +);
>>>> +
>>>>   #endif /* _TRACE_CGROUP_H */
>>>>   /* This part must be outside protection */
>>>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>>>> index 52e3b0ed1cee..fb8b49437573 100644
>>>> --- a/kernel/cgroup/rstat.c
>>>> +++ b/kernel/cgroup/rstat.c
>>>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>>>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>>>   }
>>>> +/*
>>>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>>>> + *
>>>> + * This makes it easier to diagnose locking issues and contention in
>>>> + * production environments. The parameter @fast_path determine the
>>>> + * tracepoints being added, allowing us to diagnose "flush" related
>>>> + * operations without handling high-frequency fast-path "update" 
>>>> events.
>>>> + */
>>>> +static __always_inline
>>>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int 
>>>> cpu,
>>>> +                     struct cgroup *cgrp, const bool fast_path)
>>>> +{
>>>> +    unsigned long flags;
>>>> +    bool contended;
>>>> +
>>>> +    /*
>>>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>>>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>>> +     * this lock with the _irq() suffix only disables interrupts on
>>>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>>> +     * interrupts on both configurations. The _irqsave() ensures
>>>> +     * that interrupts are always disabled and later restored.
>>>> +     */
>>>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>>> +    if (contended) {
>>>> +        if (fast_path)
>>>> + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
>>>> +        else
>>>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, 
>>>> contended);
>>>> +
>>>> +        raw_spin_lock_irqsave(cpu_lock, flags);
> 
> Could you do a local_irq_save() before calling trace_cgroup*() and 
> raw_spin_lock()? Would that help in eliminating this high lock hold time?
> 

Nope, it will not eliminate the high lock *hold* time, because the hold
start timestamp is first taken *AFTER* obtaining the lock.

It could help the contended "wait-time" measurement, but my prod
measurements show this isn't an issue.

> You can also do a local_irq_save() first before the trylock. That will 
> eliminate the duplicated irq_restore() and irq_save() when there is 
> contention.

I wrote the code like this on purpose ;-)
My issue with this code/lock is that it causes latency issues for
softirq NET_RX. So, when I detect a "contended" lock event, I do want an
irq_restore(), as that will allow networking/do_softirq() to run before
I start waiting for the lock (with IRQs disabled).


> If not, there may be NMIs mixed in.
> 

NMIs are definitely on my list of things to investigate.
These AMD CPUs also have other types of interrupts that need a closer look.

The easier explanation is that the lock isn't "yielded" on every cycle
through the for-each-CPU loop.

Lets look at the data I provided above:

 >>   Flushes(5033) 294/interval (avg 62/sec)
 >>   Locks(53374) 1748/interval (avg 667/sec)
 >>   Yields(48341) 1454/interval (avg 604/sec)
 >>   Contended(48104) 1450/interval (avg 601/sec)

In this 1-second sample, we have 294 flushes and 1454 yields, so the
lock is only yielded about 1454/294 ~= 5 times per flush, not once per
CPU (128).  Thus, on average we hold the lock for 128/5 = 25.6 CPU
walks per acquisition.

We have spoken about releasing the lock on for_each CPU before... it
will likely solve this long hold time, but IMHO a mutex is still the
better solution.

--Jesper



* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-02 11:23       ` Jesper Dangaard Brouer
@ 2024-05-02 18:19         ` Waiman Long
  2024-05-03 14:00           ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2024-05-02 18:19 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, tj, hannes, lizefan.x, cgroups,
	yosryahmed
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior

On 5/2/24 07:23, Jesper Dangaard Brouer wrote:
>
>
> On 01/05/2024 20.41, Waiman Long wrote:
>> On 5/1/24 13:22, Jesper Dangaard Brouer wrote:
>>>
>>>
>>> On 01/05/2024 16.24, Waiman Long wrote:
>>>> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>>>>> This closely resembles helpers added for the global 
>>>>> cgroup_rstat_lock in
>>>>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>>>>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>>>>
>>>>> Based on production workloads, we observe the fast-path "update" 
>>>>> function
>>>>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>>>>> while the
>>>>> "flush" function cgroup_rstat_flush_locked(), walking each 
>>>>> possible CPU,
>>>>> can see periodic spikes of 700 invocations/sec.
>>>>>
>>>>> For this reason, the tracepoints are split into normal and fastpath
>>>>> versions for this per-CPU lock. Making it feasible for production to
>>>>> continuously monitor the non-fastpath tracepoint to detect lock 
>>>>> contention
>>>>> issues. The reason for monitoring is that lock disables IRQs which 
>>>>> can
>>>>> disturb e.g. softirq processing on the local CPUs involved. When the
>>>>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a 
>>>>> mutex),
>>>>> this per CPU lock becomes the next bottleneck that can introduce 
>>>>> latency
>>>>> variations.
>>>>>
>>>>> A practical bpftrace script for monitoring contention latency:
>>>>>
>>>>>   bpftrace -e '
>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>>>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>>>>       if (args->contended) {
>>>>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>>>>       @cnt[probe]=count()}
>>>>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>>>>> clear(@cnt);}'
>>>>
>>>> This is a per-cpu lock. So the only possible contention involves 
>>>> only 2 CPUs - a local CPU invoking cgroup_rstat_updated(). A 
>>>> flusher CPU doing cgroup_rstat_flush_locked() calling into 
>>>> cgroup_rstat_updated_list(). With recent commits to reduce the 
>>>> percpu lock hold time, I doubt lock contention on the percpu lock 
>>>> will have a great impact on latency. 
>>>
>>> I do appriciate your recent changes to reduce the percpu lock hold 
>>> time.
>>> These tracepoints allow me to measure and differentiate the percpu lock
>>> hold time vs. the flush time.
>>>
>>> In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
>>> upto 29 ms, which is time spend after obtaining the lock (runtime under
>>> lock).  I was expecting to see "High Lock-contention wait" [L82] which
>>> is the time waiting for obtaining the lock.
>>>
>>> This is why I'm adding these tracepoints, as they allow me to digg
>>> deeper, to understand where this high runtime variations originate 
>>> from.
>>>
>>>
>>> Data:
>>>
>>>  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 
>>> comm:kswapd4
>>>  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 
>>> comm:kswapd4
>>>  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 
>>> comm:kswapd4
>>>  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 
>>> comm:kswapd6
>>>  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
>>> comm:kworker/u261:12
>> That lock hold time is much higher than I would have expected.
>>>  16:52:09 time elapsed: 80 sec (interval = 1 sec)
>>>   Flushes(5033) 294/interval (avg 62/sec)
>>>   Locks(53374) 1748/interval (avg 667/sec)
>>>   Yields(48341) 1454/interval (avg 604/sec)
>>>   Contended(48104) 1450/interval (avg 601/sec)
>>>
>>>
>>>> So do we really need such an elaborate scheme to monitor this? BTW, 
>>>> the additional code will also add to the worst case latency.
>>>
>>> Hmm, I designed this code to have minimal impact, as tracepoints are
>>> no-ops until activated.  I really doubt this code will change the 
>>> latency.
>>>
>>>
>>> [1] 
>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
>>>
>>> [L100] 
>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100
>>>
>>> [L82] 
>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82
>>>
>>>>>
>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>
>>> More data, the histogram of time spend under the lock have some strange
>>> variation issues with a group in 4ms to 65ms area. Investigating what
>>> can be causeing this... which next step depend in these tracepoints.
>>>
>>> @lock_cnt: 759146
>>>
>>> @locked_ns:
>>> [1K, 2K)             499 |      |
>>> [2K, 4K)          206928 
>>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
>>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>>> [64K, 128K)        38359 |@@@@@@@@@      |
>>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>>> [256K, 512K)       32466 |@@@@@@@@      |
>>> [512K, 1M)          3945 |      |
>>> [1M, 2M)             642 |      |
>>> [2M, 4M)             750 |      |
>>> [4M, 8M)            1932 |      |
>>> [8M, 16M)           2114 |      |
>>> [16M, 32M)          1039 |      |
>>> [32M, 64M)           108 |      |
>>>
>>>
>>>
>>>
>>>>> ---
>>>>>   include/trace/events/cgroup.h |   56 
>>>>> +++++++++++++++++++++++++++++----
>>>>>   kernel/cgroup/rstat.c         |   70 
>>>>> ++++++++++++++++++++++++++++++++++-------
>>>>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/include/trace/events/cgroup.h 
>>>>> b/include/trace/events/cgroup.h
>>>>> index 13f375800135..0b95865a90f3 100644
>>>>> --- a/include/trace/events/cgroup.h
> [...]
>>>>> +++ b/include/trace/events/cgroup.h >>>> 
>>>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>>>>> +
>>>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>>>> +
>>>>> +    TP_ARGS(cgrp, cpu, contended)
>>>>> +);
>>>>> +
>>>>>   #endif /* _TRACE_CGROUP_H */
>>>>>   /* This part must be outside protection */
>>>>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>>>>> index 52e3b0ed1cee..fb8b49437573 100644
>>>>> --- a/kernel/cgroup/rstat.c
>>>>> +++ b/kernel/cgroup/rstat.c
>>>>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>>>>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>>>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>>>>   }
>>>>> +/*
>>>>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>>>>> + *
>>>>> + * This makes it easier to diagnose locking issues and contention in
>>>>> + * production environments. The parameter @fast_path determine the
>>>>> + * tracepoints being added, allowing us to diagnose "flush" related
>>>>> + * operations without handling high-frequency fast-path "update" 
>>>>> events.
>>>>> + */
>>>>> +static __always_inline
>>>>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, 
>>>>> int cpu,
>>>>> +                     struct cgroup *cgrp, const bool fast_path)
>>>>> +{
>>>>> +    unsigned long flags;
>>>>> +    bool contended;
>>>>> +
>>>>> +    /*
>>>>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>>>>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>>>> +     * this lock with the _irq() suffix only disables interrupts on
>>>>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>>>> +     * interrupts on both configurations. The _irqsave() ensures
>>>>> +     * that interrupts are always disabled and later restored.
>>>>> +     */
>>>>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>>>> +    if (contended) {
>>>>> +        if (fast_path)
>>>>> + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, 
>>>>> contended);
>>>>> +        else
>>>>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, 
>>>>> contended);
>>>>> +
>>>>> +        raw_spin_lock_irqsave(cpu_lock, flags);
>>
>> Could you do a local_irq_save() before calling trace_cgroup*() and 
>> raw_spin_lock()? Would that help in eliminating this high lock hold 
>> time?
>>
>
> Nope it will not eliminating high lock *hold* time, because the hold
> start timestamp is first taken *AFTER* obtaining the lock.
>
> It could help the contended "wait-time" measurement, but my prod
> measurements show this isn't an issues.

Right.


>
>> You can also do a local_irq_save() first before the trylock. That 
>> will eliminate the duplicated irq_restore() and irq_save() when there 
>> is contention.
>
> I wrote the code like this on purpose ;-)
> My issue with this code/lock is it cause latency issues for softirq 
> NET_RX. So, when I detect a "contended" lock event, I do want a 
> irq_restore() as that will allow networking/do_softirq() to run before 
> I start waiting for the lock (with IRQ disabled).
>
Assuming the time taken by the tracing code is negligible, we are
talking about disabling IRQs almost immediately after enabling them. The
trylock time should be relatively short, so the additional delay from
keeping IRQs disabled for the whole period is insignificant.
>
>> If not, there may be NMIs mixed in.
>>
>
> NMIs are definitely on my list of things to investigate.
> These AMD CPUs also have other types of interrupts that needs a close 
> look.
>
> The easier explaination is that the lock isn't "yielded" on every cycle
> through the for each CPU loop.
>
> Lets look at the data I provided above:
>
> >>   Flushes(5033) 294/interval (avg 62/sec)
> >>   Locks(53374) 1748/interval (avg 667/sec)
> >>   Yields(48341) 1454/interval (avg 604/sec)
> >>   Contended(48104) 1450/interval (avg 601/sec)
>
> In this 1 second sample, we have 294 flushes, and more yields 1454,
> great but the factor is not 128 (num-of-CPUs) but closer to 5. Thus, on
> average we hold the lock for (128/5) 25.6 CPUs-walks.
>
> We have spoken about releasing the lock on for_each CPU before... it
> will likely solve this long hold time, but IMHO a mutex is still the
> better solution.

I may have mistakenly been thinking the lock hold time refers to just
the cpu_lock. Your reported times here are about the cgroup_rstat_lock,
right? If so, the numbers make sense to me.

Cheers,
Longman



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-01 17:22   ` Jesper Dangaard Brouer
  2024-05-01 18:41     ` Waiman Long
@ 2024-05-02 19:44     ` Shakeel Butt
  2024-05-03 12:58       ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 21+ messages in thread
From: Shakeel Butt @ 2024-05-02 19:44 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed, netdev,
	linux-mm, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior

On Wed, May 01, 2024 at 07:22:26PM +0200, Jesper Dangaard Brouer wrote:
> 
[...]
> 
> More data, the histogram of time spend under the lock have some strange
> variation issues with a group in 4ms to 65ms area. Investigating what
> can be causeing this... which next step depend in these tracepoints.
> 
> @lock_cnt: 759146
> 
> @locked_ns:
> [1K, 2K)             499 |      |
> [2K, 4K)          206928
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
> [64K, 128K)        38359 |@@@@@@@@@      |
> [128K, 256K)       46597 |@@@@@@@@@@@      |
> [256K, 512K)       32466 |@@@@@@@@      |
> [512K, 1M)          3945 |      |
> [1M, 2M)             642 |      |
> [2M, 4M)             750 |      |
> [4M, 8M)            1932 |      |
> [8M, 16M)           2114 |      |
> [16M, 32M)          1039 |      |
> [32M, 64M)           108 |      |
> 

Am I understanding correctly that 1K is 1 microsecond and 1M is 1
millisecond? Is it possible to further divide this table into update
side and flush side?



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-02 19:44     ` Shakeel Butt
@ 2024-05-03 12:58       ` Jesper Dangaard Brouer
  2024-05-03 18:11         ` Shakeel Butt
  0 siblings, 1 reply; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-03 12:58 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed, netdev,
	linux-mm, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior



On 02/05/2024 21.44, Shakeel Butt wrote:
> On Wed, May 01, 2024 at 07:22:26PM +0200, Jesper Dangaard Brouer wrote:
>>
> [...]
>>
>> More data, the histogram of time spend under the lock have some strange
>> variation issues with a group in 4ms to 65ms area. Investigating what
>> can be causeing this... which next step depend in these tracepoints.
>>
>> @lock_cnt: 759146
>>
>> @locked_ns:
>> [1K, 2K)             499 |      |
>> [2K, 4K)          206928
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>> [64K, 128K)        38359 |@@@@@@@@@      |
>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>> [256K, 512K)       32466 |@@@@@@@@      |
>> [512K, 1M)          3945 |      |
>> [1M, 2M)             642 |      |
>> [2M, 4M)             750 |      |
>> [4M, 8M)            1932 |      |
>> [8M, 16M)           2114 |      |
>> [16M, 32M)          1039 |      |
>> [32M, 64M)           108 |      |
>>
> 
> Am I understanding correctly that 1K is 1 microsecond and 1M is 1
> millisecond? 

Correct.

> Is it possible to further divide this table into update
> side and flush side?
>

This is *only* the flush side.

Your question indicates that we are talking past each other ;-)

The measurements above are with the (recently) accepted tracepoints
(i.e. not the tracepoints proposed in this patch).  My point is that I'm
seeing this data with the existing tracepoints, and that I need the
per-CPU tracepoints proposed in this patch to dig deeper into it.

The "update side" can only be measured once we apply this patch.

This morning I got 6 prod machines booted with new kernels that contain
this proposed per-CPU lock tracepoint patch.  And 3 of these machines
have the mutex lock change as well.  No data to share yet...

--Jesper


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-02 18:19         ` Waiman Long
@ 2024-05-03 14:00           ` Jesper Dangaard Brouer
  2024-05-03 14:30             ` Waiman Long
  2024-05-03 19:18             ` Shakeel Butt
  0 siblings, 2 replies; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-03 14:00 UTC (permalink / raw)
  To: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior



On 02/05/2024 20.19, Waiman Long wrote:
> On 5/2/24 07:23, Jesper Dangaard Brouer wrote:
>>
>>
>> On 01/05/2024 20.41, Waiman Long wrote:
>>> On 5/1/24 13:22, Jesper Dangaard Brouer wrote:
>>>>
>>>>
>>>> On 01/05/2024 16.24, Waiman Long wrote:
>>>>> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>>>>>> This closely resembles helpers added for the global 
>>>>>> cgroup_rstat_lock in
>>>>>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>>>>>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>>>>>
>>>>>> Based on production workloads, we observe the fast-path "update" 
>>>>>> function
>>>>>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>>>>>> while the
>>>>>> "flush" function cgroup_rstat_flush_locked(), walking each 
>>>>>> possible CPU,
>>>>>> can see periodic spikes of 700 invocations/sec.
>>>>>>
>>>>>> For this reason, the tracepoints are split into normal and fastpath
>>>>>> versions for this per-CPU lock. Making it feasible for production to
>>>>>> continuously monitor the non-fastpath tracepoint to detect lock 
>>>>>> contention
>>>>>> issues. The reason for monitoring is that lock disables IRQs which 
>>>>>> can
>>>>>> disturb e.g. softirq processing on the local CPUs involved. When the
>>>>>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a 
>>>>>> mutex),
>>>>>> this per CPU lock becomes the next bottleneck that can introduce 
>>>>>> latency
>>>>>> variations.
>>>>>>
>>>>>> A practical bpftrace script for monitoring contention latency:
>>>>>>
>>>>>>   bpftrace -e '
>>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>>>>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>>>>>       if (args->contended) {
>>>>>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>>>>>       @cnt[probe]=count()}
>>>>>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>>>>>> clear(@cnt);}'
>>>>>
>>>>> This is a per-cpu lock. So the only possible contention involves 
>>>>> only 2 CPUs - a local CPU invoking cgroup_rstat_updated(). A 
>>>>> flusher CPU doing cgroup_rstat_flush_locked() calling into 
>>>>> cgroup_rstat_updated_list(). With recent commits to reduce the 
>>>>> percpu lock hold time, I doubt lock contention on the percpu lock 
>>>>> will have a great impact on latency. 
>>>>
>>>> I do appriciate your recent changes to reduce the percpu lock hold 
>>>> time.
>>>> These tracepoints allow me to measure and differentiate the percpu lock
>>>> hold time vs. the flush time.
>>>>
>>>> In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
>>>> upto 29 ms, which is time spend after obtaining the lock (runtime under
>>>> lock).  I was expecting to see "High Lock-contention wait" [L82] which
>>>> is the time waiting for obtaining the lock.
>>>>
>>>> This is why I'm adding these tracepoints, as they allow me to digg
>>>> deeper, to understand where this high runtime variations originate 
>>>> from.
>>>>
>>>>
>>>> Data:
>>>>
>>>>  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 
>>>> comm:kswapd4
>>>>  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 
>>>> comm:kswapd4
>>>>  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 
>>>> comm:kswapd4
>>>>  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 
>>>> comm:kswapd6
>>>>  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
>>>> comm:kworker/u261:12
>>> That lock hold time is much higher than I would have expected.
>>>>  16:52:09 time elapsed: 80 sec (interval = 1 sec)
>>>>   Flushes(5033) 294/interval (avg 62/sec)
>>>>   Locks(53374) 1748/interval (avg 667/sec)
>>>>   Yields(48341) 1454/interval (avg 604/sec)
>>>>   Contended(48104) 1450/interval (avg 601/sec)
>>>>
>>>>
>>>>> So do we really need such an elaborate scheme to monitor this? BTW, 
>>>>> the additional code will also add to the worst case latency.
>>>>
>>>> Hmm, I designed this code to have minimal impact, as tracepoints are
>>>> no-ops until activated.  I really doubt this code will change the 
>>>> latency.
>>>>
>>>>
>>>> [1] 
>>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
>>>>
>>>> [L100] 
>>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100
>>>>
>>>> [L82] 
>>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82
>>>>
>>>>>>
>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>
>>>> More data, the histogram of time spend under the lock have some strange
>>>> variation issues with a group in 4ms to 65ms area. Investigating what
>>>> can be causeing this... which next step depend in these tracepoints.
>>>>
>>>> @lock_cnt: 759146
>>>>
>>>> @locked_ns:
>>>> [1K, 2K)             499 |      |
>>>> [2K, 4K)          206928 
>>>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>>>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>>>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
>>>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>>>> [64K, 128K)        38359 |@@@@@@@@@      |
>>>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>>>> [256K, 512K)       32466 |@@@@@@@@      |
>>>> [512K, 1M)          3945 |      |
>>>> [1M, 2M)             642 |      |
>>>> [2M, 4M)             750 |      |
>>>> [4M, 8M)            1932 |      |
>>>> [8M, 16M)           2114 |      |
>>>> [16M, 32M)          1039 |      |
>>>> [32M, 64M)           108 |      |
>>>>
>>>>
>>>>
>>>>
>>>>>> ---
>>>>>>   include/trace/events/cgroup.h |   56 
>>>>>> +++++++++++++++++++++++++++++----
>>>>>>   kernel/cgroup/rstat.c         |   70 
>>>>>> ++++++++++++++++++++++++++++++++++-------
>>>>>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>>>>>
>>>>>> diff --git a/include/trace/events/cgroup.h 
>>>>>> b/include/trace/events/cgroup.h
>>>>>> index 13f375800135..0b95865a90f3 100644
>>>>>> --- a/include/trace/events/cgroup.h
>> [...]
>>>>>> +++ b/include/trace/events/cgroup.h >>>> 
>>>>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>>>>>> +
>>>>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>>>>> +
>>>>>> +    TP_ARGS(cgrp, cpu, contended)
>>>>>> +);
>>>>>> +
>>>>>>   #endif /* _TRACE_CGROUP_H */
>>>>>>   /* This part must be outside protection */
>>>>>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>>>>>> index 52e3b0ed1cee..fb8b49437573 100644
>>>>>> --- a/kernel/cgroup/rstat.c
>>>>>> +++ b/kernel/cgroup/rstat.c
>>>>>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>>>>>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>>>>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>>>>>   }
>>>>>> +/*
>>>>>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>>>>>> + *
>>>>>> + * This makes it easier to diagnose locking issues and contention in
>>>>>> + * production environments. The parameter @fast_path determine the
>>>>>> + * tracepoints being added, allowing us to diagnose "flush" related
>>>>>> + * operations without handling high-frequency fast-path "update" 
>>>>>> events.
>>>>>> + */
>>>>>> +static __always_inline
>>>>>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, 
>>>>>> int cpu,
>>>>>> +                     struct cgroup *cgrp, const bool fast_path)
>>>>>> +{
>>>>>> +    unsigned long flags;
>>>>>> +    bool contended;
>>>>>> +
>>>>>> +    /*
>>>>>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>>>>>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>>>>> +     * this lock with the _irq() suffix only disables interrupts on
>>>>>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>>>>> +     * interrupts on both configurations. The _irqsave() ensures
>>>>>> +     * that interrupts are always disabled and later restored.
>>>>>> +     */
>>>>>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>>>>> +    if (contended) {
>>>>>> +        if (fast_path)
>>>>>> + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, 
>>>>>> contended);
>>>>>> +        else
>>>>>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, 
>>>>>> contended);
>>>>>> +
>>>>>> +        raw_spin_lock_irqsave(cpu_lock, flags);
>>>
>>> Could you do a local_irq_save() before calling trace_cgroup*() and 
>>> raw_spin_lock()? Would that help in eliminating this high lock hold 
>>> time?
>>>
>>
>> Nope it will not eliminating high lock *hold* time, because the hold
>> start timestamp is first taken *AFTER* obtaining the lock.
>>
>> It could help the contended "wait-time" measurement, but my prod
>> measurements show this isn't an issues.
> 
> Right.
> 
> 
>>
>>> You can also do a local_irq_save() first before the trylock. That 
>>> will eliminate the duplicated irq_restore() and irq_save() when there 
>>> is contention.
>>
>> I wrote the code like this on purpose ;-)
>> My issue with this code/lock is it cause latency issues for softirq 
>> NET_RX. So, when I detect a "contended" lock event, I do want a 
>> irq_restore() as that will allow networking/do_softirq() to run before 
>> I start waiting for the lock (with IRQ disabled).
>>
> Assuming the time taken by the tracing code is negligible, we are 
> talking about disabling IRQ almost immediate after enabling it. The 
> trylock time should be relatively short so the additional delay due to 
> irq disabled for the whole period is insignificant.
>>
>>> If not, there may be NMIs mixed in.
>>>
>>
>> NMIs are definitely on my list of things to investigate.
>> These AMD CPUs also have other types of interrupts that needs a close 
>> look.
>>
>> The easier explaination is that the lock isn't "yielded" on every cycle
>> through the for each CPU loop.
>>
>> Lets look at the data I provided above:
>>
>> >>   Flushes(5033) 294/interval (avg 62/sec)
>> >>   Locks(53374) 1748/interval (avg 667/sec)
>> >>   Yields(48341) 1454/interval (avg 604/sec)
>> >>   Contended(48104) 1450/interval (avg 601/sec)
>>
>> In this 1 second sample, we have 294 flushes, and more yields 1454,
>> great but the factor is not 128 (num-of-CPUs) but closer to 5. Thus, on
>> average we hold the lock for (128/5) 25.6 CPUs-walks.
>>
>> We have spoken about releasing the lock on for_each CPU before... it
>> will likely solve this long hold time, but IMHO a mutex is still the
>> better solution.
> 
> I may have mistakenly thinking the lock hold time refers to just the 
> cpu_lock. Your reported times here are about the cgroup_rstat_lock. 
> Right? If so, the numbers make sense to me.
> 

True, my reported numbers here are about the cgroup_rstat_lock.
Glad to hear we are more aligned then :-)

Given I just got some prod machines online with this patch's
cgroup_rstat_cpu_lock tracepoints, I can give you some early results
about hold-time for the cgroup_rstat_cpu_lock.

From this one-liner bpftrace command:

   sudo bpftrace -e '
          tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
            @start[tid]=nsecs; @cnt[probe]=count()}
          tracepoint:cgroup:cgroup_rstat_cpu_locked {
            $now=nsecs;
            if (args->contended) {
              @wait_per_cpu_ns=hist($now-@start[tid]); delete(@start[tid]);}
            @cnt[probe]=count(); @locked[tid]=$now}
          tracepoint:cgroup:cgroup_rstat_cpu_unlock {
            $now=nsecs;
            @locked_per_cpu_ns=hist($now-@locked[tid]); delete(@locked[tid]);
            @cnt[probe]=count()}
          interval:s:1 {time("%H:%M:%S "); print(@wait_per_cpu_ns);
            print(@locked_per_cpu_ns); print(@cnt); clear(@cnt);}'

Results from one 1 sec period:

13:39:55 @wait_per_cpu_ns:
[512, 1K)              3 |      |
[1K, 2K)              12 |@      |
[2K, 4K)             390 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K)              70 |@@@@@@@@@      |
[8K, 16K)             24 |@@@      |
[16K, 32K)           183 |@@@@@@@@@@@@@@@@@@@@@@@@      |
[32K, 64K)            11 |@      |

@locked_per_cpu_ns:
[256, 512)         75592 |@      |
[512, 1K)        2537357 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1K, 2K)          528615 |@@@@@@@@@@      |
[2K, 4K)          168519 |@@@      |
[4K, 8K)          162039 |@@@      |
[8K, 16K)         100730 |@@      |
[16K, 32K)         42276 |      |
[32K, 64K)          1423 |      |
[64K, 128K)           89 |      |

  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_lock_contended]: 3 /sec
  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_unlock]: 3200  /sec
  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_locked]: 3200  /sec


So, we see "flush-code-path" per-CPU-holding @locked_per_cpu_ns isn't
exceeding 128 usec.

My latency requirements, to avoid RX-queue overflow, with 1024 slots,
running at 25 Gbit/s, is 27.6 usec with small packets, and 500 usec
(0.5ms) with MTU size packets.  This is very close to my latency
requirements.
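
Assuming roughly 84 bytes on the wire per minimum-size frame and 1538
bytes per MTU-size frame (Ethernet header, FCS, preamble and inter-frame
gap included), a stand-alone back-of-envelope check reproduces these
numbers:

#include <stdio.h>

int main(void)
{
	double link_bps = 25e9;      /* 25 Gbit/s line rate              */
	int    slots    = 1024;      /* RX-queue descriptors             */
	double small    = 84 * 8;    /* bits on the wire, min-size frame */
	double mtu      = 1538 * 8;  /* bits on the wire, MTU-size frame */

	/* time to fill the whole RX-queue == time budget to react */
	printf("small frames: %.1f usec\n", slots * small / link_bps * 1e6);
	printf("MTU frames  : %.1f usec\n", slots * mtu   / link_bps * 1e6);
	return 0;
}

This prints ~27.5 usec and ~504 usec, matching the budgets above.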

--Jesper



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-03 14:00           ` Jesper Dangaard Brouer
@ 2024-05-03 14:30             ` Waiman Long
  2024-05-03 19:18             ` Shakeel Butt
  1 sibling, 0 replies; 21+ messages in thread
From: Waiman Long @ 2024-05-03 14:30 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, tj, hannes, lizefan.x, cgroups,
	yosryahmed
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior


On 5/3/24 10:00, Jesper Dangaard Brouer wrote:
>> I may have mistakenly thinking the lock hold time refers to just the 
>> cpu_lock. Your reported times here are about the cgroup_rstat_lock. 
>> Right? If so, the numbers make sense to me.
>>
>
> True, my reported number here are about the cgroup_rstat_lock.
> Glad to hear, we are more aligned then 🙂
>
> Given I just got some prod machines online with this patch
> cgroup_rstat_cpu_lock tracepoints, I can give you some early results,
> about hold-time for the cgroup_rstat_cpu_lock.
>
> From this oneliner bpftrace commands:
>
>   sudo bpftrace -e '
>          tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>            @start[tid]=nsecs; @cnt[probe]=count()}
>          tracepoint:cgroup:cgroup_rstat_cpu_locked {
>            $now=nsecs;
>            if (args->contended) {
>              @wait_per_cpu_ns=hist($now-@start[tid]); 
> delete(@start[tid]);}
>            @cnt[probe]=count(); @locked[tid]=$now}
>          tracepoint:cgroup:cgroup_rstat_cpu_unlock {
>            $now=nsecs;
>            @locked_per_cpu_ns=hist($now-@locked[tid]); 
> delete(@locked[tid]);
>            @cnt[probe]=count()}
>          interval:s:1 {time("%H:%M:%S "); print(@wait_per_cpu_ns);
>            print(@locked_per_cpu_ns); print(@cnt); clear(@cnt);}'
>
> Results from one 1 sec period:
>
> 13:39:55 @wait_per_cpu_ns:
> [512, 1K)              3 |      |
> [1K, 2K)              12 |@      |
> [2K, 4K)             390 
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4K, 8K)              70 |@@@@@@@@@      |
> [8K, 16K)             24 |@@@      |
> [16K, 32K)           183 |@@@@@@@@@@@@@@@@@@@@@@@@      |
> [32K, 64K)            11 |@      |
>
> @locked_per_cpu_ns:
> [256, 512)         75592 |@      |
> [512, 1K)        2537357 
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [1K, 2K)          528615 |@@@@@@@@@@      |
> [2K, 4K)          168519 |@@@      |
> [4K, 8K)          162039 |@@@      |
> [8K, 16K)         100730 |@@      |
> [16K, 32K)         42276 |      |
> [32K, 64K)          1423 |      |
> [64K, 128K)           89 |      |
>
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_lock_contended]: 3 /sec
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_unlock]: 3200  /sec
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_locked]: 3200  /sec
>
>
> So, we see "flush-code-path" per-CPU-holding @locked_per_cpu_ns isn't
> exceeding 128 usec.
>
> My latency requirements, to avoid RX-queue overflow, with 1024 slots,
> running at 25 Gbit/s, is 27.6 usec with small packets, and 500 usec
> (0.5ms) with MTU size packets.  This is very close to my latency
> requirements. 

Thanks for sharing the data.

This is more aligned with what I would have expected. Still, up to 
128 usec is on the high side. I remember that during my latency 
testing, when I worked on the cpu_lock latency patch, it was in the 
2-digit range. Perhaps there are other sources of noise, or the update 
list is really long. Anyway, it may be a bit hard to reach the 27.6 usec 
target for small packets.

Cheers,
Longman



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-03 12:58       ` Jesper Dangaard Brouer
@ 2024-05-03 18:11         ` Shakeel Butt
  0 siblings, 0 replies; 21+ messages in thread
From: Shakeel Butt @ 2024-05-03 18:11 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed, netdev,
	linux-mm, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior

On Fri, May 03, 2024 at 02:58:56PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 02/05/2024 21.44, Shakeel Butt wrote:
> > On Wed, May 01, 2024 at 07:22:26PM +0200, Jesper Dangaard Brouer wrote:
> > > 
> > [...]
> > > 
> > > More data, the histogram of time spend under the lock have some strange
> > > variation issues with a group in 4ms to 65ms area. Investigating what
> > > can be causeing this... which next step depend in these tracepoints.
> > > 
> > > @lock_cnt: 759146
> > > 
> > > @locked_ns:
> > > [1K, 2K)             499 |      |
> > > [2K, 4K)          206928
> > > |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> > > [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> > > [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
> > > [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> > > [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
> > > [64K, 128K)        38359 |@@@@@@@@@      |
> > > [128K, 256K)       46597 |@@@@@@@@@@@      |
> > > [256K, 512K)       32466 |@@@@@@@@      |
> > > [512K, 1M)          3945 |      |
> > > [1M, 2M)             642 |      |
> > > [2M, 4M)             750 |      |
> > > [4M, 8M)            1932 |      |
> > > [8M, 16M)           2114 |      |
> > > [16M, 32M)          1039 |      |
> > > [32M, 64M)           108 |      |
> > > 
> > 
> > Am I understanding correctly that 1K is 1 microsecond and 1M is 1
> > millisecond?
> 
> Correct.
> 
> > Is it possible to further divide this table into update
> > side and flush side?
> > 
> 
> This is *only* flush side.
> 
> You question indicate, that we are talking past each-other ;-)
> 
> Measurements above is with (recently) accepted tracepoints (e.g. not the
> proposed tracepoints in this patch).  I'm arguing with existing
> tracepoint that I'm seeing this data, and arguing I need per-CPU
> tracepoints to dig deeper into this (as proposed in this patch).

Ah, my mistake, I just assumed that the data shown was from the given
patchset.

> 
> The "update side" can only be measured once we apply this patch.
> 
> This morning I got 6 prod machines booted with new kernels, that contain
> this proposed per-CPU lock tracepoint patch.  And 3 of these machines have
> the Mutex lock change also.  No data to share yet...
> 

Eagerly waiting for the results. Also I don't have any concerns with
these new traces.

> --Jesper


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-03 14:00           ` Jesper Dangaard Brouer
  2024-05-03 14:30             ` Waiman Long
@ 2024-05-03 19:18             ` Shakeel Butt
  2024-05-06 12:03               ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 21+ messages in thread
From: Shakeel Butt @ 2024-05-03 19:18 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed, netdev,
	linux-mm, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior

On Fri, May 03, 2024 at 04:00:20PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
[...]
> > 
> > I may have mistakenly thinking the lock hold time refers to just the
> > cpu_lock. Your reported times here are about the cgroup_rstat_lock.
> > Right? If so, the numbers make sense to me.
> > 
> 
> True, my reported number here are about the cgroup_rstat_lock.
> Glad to hear, we are more aligned then :-)
> 
> Given I just got some prod machines online with this patch
> cgroup_rstat_cpu_lock tracepoints, I can give you some early results,
> about hold-time for the cgroup_rstat_cpu_lock.

Oh you have already shared the preliminary data.

> 
> From this oneliner bpftrace commands:
> 
>   sudo bpftrace -e '
>          tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>            @start[tid]=nsecs; @cnt[probe]=count()}
>          tracepoint:cgroup:cgroup_rstat_cpu_locked {
>            $now=nsecs;
>            if (args->contended) {
>              @wait_per_cpu_ns=hist($now-@start[tid]); delete(@start[tid]);}
>            @cnt[probe]=count(); @locked[tid]=$now}
>          tracepoint:cgroup:cgroup_rstat_cpu_unlock {
>            $now=nsecs;
>            @locked_per_cpu_ns=hist($now-@locked[tid]); delete(@locked[tid]);
>            @cnt[probe]=count()}
>          interval:s:1 {time("%H:%M:%S "); print(@wait_per_cpu_ns);
>            print(@locked_per_cpu_ns); print(@cnt); clear(@cnt);}'
> 
> Results from one 1 sec period:
> 
> 13:39:55 @wait_per_cpu_ns:
> [512, 1K)              3 |      |
> [1K, 2K)              12 |@      |
> [2K, 4K)             390
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4K, 8K)              70 |@@@@@@@@@      |
> [8K, 16K)             24 |@@@      |
> [16K, 32K)           183 |@@@@@@@@@@@@@@@@@@@@@@@@      |
> [32K, 64K)            11 |@      |
> 
> @locked_per_cpu_ns:
> [256, 512)         75592 |@      |
> [512, 1K)        2537357
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [1K, 2K)          528615 |@@@@@@@@@@      |
> [2K, 4K)          168519 |@@@      |
> [4K, 8K)          162039 |@@@      |
> [8K, 16K)         100730 |@@      |
> [16K, 32K)         42276 |      |
> [32K, 64K)          1423 |      |
> [64K, 128K)           89 |      |
> 
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_lock_contended]: 3 /sec
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_unlock]: 3200  /sec
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_locked]: 3200  /sec
> 
> 
> So, we see "flush-code-path" per-CPU-holding @locked_per_cpu_ns isn't
> exceeding 128 usec.

Hmm, 128 usec is actually unexpectedly high. What does the cgroup
hierarchy on your system look like? How many cgroups have actual
workloads running? Can the network softirqs run on any CPU, or only on a
smaller set of CPUs? I am assuming these softirqs are processing packets
from any or all cgroups and thus have a larger cgroup update tree. I
wonder if you could comment out the MEMCG_SOCK stat update and still see
the same holding time.

> 
> My latency requirements, to avoid RX-queue overflow, with 1024 slots,
> running at 25 Gbit/s, is 27.6 usec with small packets, and 500 usec
> (0.5ms) with MTU size packets.  This is very close to my latency
> requirements.
> 
> --Jesper
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-03 19:18             ` Shakeel Butt
@ 2024-05-06 12:03               ` Jesper Dangaard Brouer
  2024-05-06 16:22                 ` Shakeel Butt
  0 siblings, 1 reply; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-06 12:03 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed, netdev,
	linux-mm, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior, Daniel Dao, Ivan Babrou, jr



On 03/05/2024 21.18, Shakeel Butt wrote:
> On Fri, May 03, 2024 at 04:00:20PM +0200, Jesper Dangaard Brouer wrote:
>>
>>
> [...]
>>>
>>> I may have mistakenly thinking the lock hold time refers to just the
>>> cpu_lock. Your reported times here are about the cgroup_rstat_lock.
>>> Right? If so, the numbers make sense to me.
>>>
>>
>> True, my reported number here are about the cgroup_rstat_lock.
>> Glad to hear, we are more aligned then :-)
>>
>> Given I just got some prod machines online with this patch
>> cgroup_rstat_cpu_lock tracepoints, I can give you some early results,
>> about hold-time for the cgroup_rstat_cpu_lock.
> 
> Oh you have already shared the preliminary data.
> 
>>
>>  From this oneliner bpftrace commands:
>>
>>    sudo bpftrace -e '
>>           tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>             @start[tid]=nsecs; @cnt[probe]=count()}
>>           tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>             $now=nsecs;
>>             if (args->contended) {
>>               @wait_per_cpu_ns=hist($now-@start[tid]); delete(@start[tid]);}
>>             @cnt[probe]=count(); @locked[tid]=$now}
>>           tracepoint:cgroup:cgroup_rstat_cpu_unlock {
>>             $now=nsecs;
>>             @locked_per_cpu_ns=hist($now-@locked[tid]); delete(@locked[tid]);
>>             @cnt[probe]=count()}
>>           interval:s:1 {time("%H:%M:%S "); print(@wait_per_cpu_ns);
>>             print(@locked_per_cpu_ns); print(@cnt); clear(@cnt);}'
>>
>> Results from one 1 sec period:
>>
>> 13:39:55 @wait_per_cpu_ns:
>> [512, 1K)              3 |      |
>> [1K, 2K)              12 |@      |
>> [2K, 4K)             390
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4K, 8K)              70 |@@@@@@@@@      |
>> [8K, 16K)             24 |@@@      |
>> [16K, 32K)           183 |@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [32K, 64K)            11 |@      |
>>
>> @locked_per_cpu_ns:
>> [256, 512)         75592 |@      |
>> [512, 1K)        2537357
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [1K, 2K)          528615 |@@@@@@@@@@      |
>> [2K, 4K)          168519 |@@@      |
>> [4K, 8K)          162039 |@@@      |
>> [8K, 16K)         100730 |@@      |
>> [16K, 32K)         42276 |      |
>> [32K, 64K)          1423 |      |
>> [64K, 128K)           89 |      |
>>
>>   @cnt[tracepoint:cgroup:cgroup_rstat_cpu_lock_contended]: 3 /sec
>>   @cnt[tracepoint:cgroup:cgroup_rstat_cpu_unlock]: 3200  /sec
>>   @cnt[tracepoint:cgroup:cgroup_rstat_cpu_locked]: 3200  /sec
>>
>>
>> So, we see "flush-code-path" per-CPU-holding @locked_per_cpu_ns isn't
>> exceeding 128 usec.
> 
> Hmm 128 usec is actually unexpectedly high. 

> How does the cgroup hierarchy on your system looks like? 
I didn't design this, so hopefully my co-workers can help me out here? 
(To @Daniel or @Jon)

My low-level view is that there are 17 top-level directories in 
/sys/fs/cgroup/.
There are 649 cgroups (counting occurrences of memory.stat).
Two directories contain the major part:
  - /sys/fs/cgroup/system.slice = 379
  - /sys/fs/cgroup/production.slice = 233
  - (production.slice has directories two levels deep)
  - remaining 37

We are open to changing this if you have any advice?
(@Daniel and @Jon are actually working on restructuring this)

> How many cgroups have actual workloads running?
Do you have a command line trick to determine this?


> Can the network softirqs run on any cpus or smaller
> set of cpus? I am assuming these softirqs are processing packets from
> any or all cgroups and thus have larger cgroup update tree. 

Softirq, and specifically NET_RX, is running on half of the cores (e.g. 64).
(I'm looking at restructuring this allocation.)

> I wonder if
> you comment out MEMCG_SOCK stat update and still see the same holding
> time.
>

It doesn't look like MEMCG_SOCK is used.

I deduce you are asking:
  - What is the update count for different types of mod_memcg_state() calls?

// Dumped via BTF info
enum memcg_stat_item {
         MEMCG_SWAP = 43,
         MEMCG_SOCK = 44,
         MEMCG_PERCPU_B = 45,
         MEMCG_VMALLOC = 46,
         MEMCG_KMEM = 47,
         MEMCG_ZSWAP_B = 48,
         MEMCG_ZSWAPPED = 49,
         MEMCG_NR_STAT = 50,
};

sudo bpftrace -e 'kfunc:vmlinux:__mod_memcg_state{@[args->idx]=count()} 
END{printf("\nEND time elapsed: %d sec\n", elapsed / 1000000000);}'
Attaching 2 probes...
^C
END time elapsed: 99 sec

@[45]: 17996
@[46]: 18603
@[43]: 61858
@[47]: 21398919

It seems clear that MEMCG_KMEM = 47 is the main "user".
  - 21398919/99 = 216150 calls per sec

Could someone explain to me what this MEMCG_KMEM is used for?


>>
>> My latency requirements, to avoid RX-queue overflow, with 1024 slots,
>> running at 25 Gbit/s, is 27.6 usec with small packets, and 500 usec
>> (0.5ms) with MTU size packets.  This is very close to my latency
>> requirements.
>>
>> --Jesper
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-06 12:03               ` Jesper Dangaard Brouer
@ 2024-05-06 16:22                 ` Shakeel Butt
  2024-05-06 16:28                   ` Ivan Babrou
  2024-05-06 19:54                   ` Jesper Dangaard Brouer
  0 siblings, 2 replies; 21+ messages in thread
From: Shakeel Butt @ 2024-05-06 16:22 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed, netdev,
	linux-mm, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior, Daniel Dao, Ivan Babrou, jr

On Mon, May 06, 2024 at 02:03:47PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 03/05/2024 21.18, Shakeel Butt wrote:
[...]
> > 
> > Hmm 128 usec is actually unexpectedly high.
> 
> > How does the cgroup hierarchy on your system looks like?
> I didn't design this, so hopefully my co-workers can help me out here? (To
> @Daniel or @Jon)
> 
> My low level view is that, there are 17 top-level directories in
> /sys/fs/cgroup/.
> There are 649 cgroups (counting occurrence of memory.stat).
> There are two directories that contain the major part.
>  - /sys/fs/cgroup/system.slice = 379
>  - /sys/fs/cgroup/production.slice = 233
>  - (production.slice have directory two levels)
>  - remaining 37
> 
> We are open to changing this if you have any advice?
> (@Daniel and @Jon are actually working on restructuring this)
> 
> > How many cgroups have actual workloads running?
> Do you have a command line trick to determine this?
> 

The rstat infra maintains a per-cpu cgroup update tree to only flush
stats of cgroups which have seen updates. So, even if you have a large
number of cgroups but the workload is active in only a small number of
them, the update tree should be much smaller (rough sketch below). That
is the reason I asked these questions. I don't have any advice yet. At
the moment I am trying to understand the usage and then hopefully work
on optimizing those paths.
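
Roughly, the update side does something like the following simplified
sketch (hypothetical types, not the kernel's structures -- see
cgroup_rstat_updated() in kernel/cgroup/rstat.c for the real code):

#include <stdbool.h>

struct node {
	struct node *parent;
	struct node *updated_next;      /* next updated sibling, if any */
	struct node *updated_children;  /* first updated child, if any  */
	bool on_tree;                   /* already on this CPU's tree?  */
};

/* Called on the CPU that just modified a counter for @n */
static void mark_updated(struct node *n)
{
	/* Link @n and any not-yet-linked ancestors into the per-CPU tree.
	 * Stop at the first ancestor already linked; everything above it
	 * is linked too, so a flush starting at the root will find us.
	 */
	for (; n && !n->on_tree; n = n->parent) {
		n->on_tree = true;
		if (n->parent) {
			n->updated_next = n->parent->updated_children;
			n->parent->updated_children = n;
		}
	}
}

The flusher then walks only the updated_children chains from the root,
so cgroups that saw no updates on that CPU are never visited.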

> 
> > Can the network softirqs run on any cpus or smaller
> > set of cpus? I am assuming these softirqs are processing packets from
> > any or all cgroups and thus have larger cgroup update tree.
> 
> Softirq and specifically NET_RX is running half of the cores (e.g. 64).
> (I'm looking at restructuring this allocation)
> 
> > I wonder if
> > you comment out MEMCG_SOCK stat update and still see the same holding
> > time.
> > 
> 
> It doesn't look like MEMCG_SOCK is used.
> 
> I deduct you are asking:
>  - What is the update count for different types of mod_memcg_state() calls?
> 
> // Dumped via BTF info
> enum memcg_stat_item {
>         MEMCG_SWAP = 43,
>         MEMCG_SOCK = 44,
>         MEMCG_PERCPU_B = 45,
>         MEMCG_VMALLOC = 46,
>         MEMCG_KMEM = 47,
>         MEMCG_ZSWAP_B = 48,
>         MEMCG_ZSWAPPED = 49,
>         MEMCG_NR_STAT = 50,
> };
> 
> sudo bpftrace -e 'kfunc:vmlinux:__mod_memcg_state{@[args->idx]=count()}
> END{printf("\nEND time elapsed: %d sec\n", elapsed / 1000000000);}'
> Attaching 2 probes...
> ^C
> END time elapsed: 99 sec
> 
> @[45]: 17996
> @[46]: 18603
> @[43]: 61858
> @[47]: 21398919
> 
> It seems clear that MEMCG_KMEM = 47 is the main "user".
>  - 21398919/99 = 216150 calls per sec
> 
> Could someone explain to me what this MEMCG_KMEM is used for?
> 

MEMCG_KMEM is the kernel memory charged to a cgroup. It also contains
the untyped kernel memory which is not included in kernel_stack,
pagetables, percpu, vmalloc, slab, etc.

The reason I asked about MEMCG_SOCK was that it might be causing larger
update trees (more cgroups) on CPUs processing the NET_RX.

Anyway, did the mutex change help your production workload regarding
latencies?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-06 16:22                 ` Shakeel Butt
@ 2024-05-06 16:28                   ` Ivan Babrou
  2024-05-06 19:45                     ` Shakeel Butt
  2024-05-06 19:54                   ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 21+ messages in thread
From: Ivan Babrou @ 2024-05-06 16:28 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Jesper Dangaard Brouer, Waiman Long, tj, hannes, lizefan.x,
	cgroups, yosryahmed, netdev, linux-mm, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior, Daniel Dao,
	jr

On Mon, May 6, 2024 at 9:22 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Mon, May 06, 2024 at 02:03:47PM +0200, Jesper Dangaard Brouer wrote:
> >
> >
> > On 03/05/2024 21.18, Shakeel Butt wrote:
> [...]
> > >
> > > Hmm 128 usec is actually unexpectedly high.
> >
> > > How does the cgroup hierarchy on your system looks like?
> > I didn't design this, so hopefully my co-workers can help me out here? (To
> > @Daniel or @Jon)
> >
> > My low level view is that, there are 17 top-level directories in
> > /sys/fs/cgroup/.
> > There are 649 cgroups (counting occurrence of memory.stat).
> > There are two directories that contain the major part.
> >  - /sys/fs/cgroup/system.slice = 379
> >  - /sys/fs/cgroup/production.slice = 233
> >  - (production.slice have directory two levels)
> >  - remaining 37
> >
> > We are open to changing this if you have any advice?
> > (@Daniel and @Jon are actually working on restructuring this)
> >
> > > How many cgroups have actual workloads running?
> > Do you have a command line trick to determine this?
> >
>
> The rstat infra maintains a per-cpu cgroup update tree to only flush
> stats of cgroups which have seen updates. So, even if you have large
> number of cgroups but the workload is active in small number of cgroups,
> the update tree should be much smaller. That is the reason I asked these
> questions. I don't have any advise yet. At the I am trying to understand
> the usage and then hopefully work on optimizing those.
>
> >
> > > Can the network softirqs run on any cpus or smaller
> > > set of cpus? I am assuming these softirqs are processing packets from
> > > any or all cgroups and thus have larger cgroup update tree.
> >
> > Softirq and specifically NET_RX is running half of the cores (e.g. 64).
> > (I'm looking at restructuring this allocation)
> >
> > > I wonder if
> > > you comment out MEMCG_SOCK stat update and still see the same holding
> > > time.
> > >
> >
> > It doesn't look like MEMCG_SOCK is used.
> >
> > I deduct you are asking:
> >  - What is the update count for different types of mod_memcg_state() calls?
> >
> > // Dumped via BTF info
> > enum memcg_stat_item {
> >         MEMCG_SWAP = 43,
> >         MEMCG_SOCK = 44,
> >         MEMCG_PERCPU_B = 45,
> >         MEMCG_VMALLOC = 46,
> >         MEMCG_KMEM = 47,
> >         MEMCG_ZSWAP_B = 48,
> >         MEMCG_ZSWAPPED = 49,
> >         MEMCG_NR_STAT = 50,
> > };
> >
> > sudo bpftrace -e 'kfunc:vmlinux:__mod_memcg_state{@[args->idx]=count()}
> > END{printf("\nEND time elapsed: %d sec\n", elapsed / 1000000000);}'
> > Attaching 2 probes...
> > ^C
> > END time elapsed: 99 sec
> >
> > @[45]: 17996
> > @[46]: 18603
> > @[43]: 61858
> > @[47]: 21398919
> >
> > It seems clear that MEMCG_KMEM = 47 is the main "user".
> >  - 21398919/99 = 216150 calls per sec
> >
> > Could someone explain to me what this MEMCG_KMEM is used for?
> >
>
> MEMCG_KMEM is the kernel memory charged to a cgroup. It also contains
> the untyped kernel memory which is not included in kernel_stack,
> pagetables, percpu, vmalloc, slab e.t.c.
>
> The reason I asked about MEMCG_SOCK was that it might be causing larger
> update trees (more cgroups) on CPUs processing the NET_RX.

We pass cgroup.memory=nosocket in the kernel cmdline:

* https://lore.kernel.org/lkml/CABWYdi0G7cyNFbndM-ELTDAR3x4Ngm0AehEp5aP0tfNkXUE+Uw@mail.gmail.com/

> Anyways did the mutex change helped your production workload regarding
> latencies?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-06 16:28                   ` Ivan Babrou
@ 2024-05-06 19:45                     ` Shakeel Butt
  0 siblings, 0 replies; 21+ messages in thread
From: Shakeel Butt @ 2024-05-06 19:45 UTC (permalink / raw)
  To: Ivan Babrou
  Cc: Jesper Dangaard Brouer, Waiman Long, tj, hannes, lizefan.x,
	cgroups, yosryahmed, netdev, linux-mm, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior, Daniel Dao,
	jr

On Mon, May 06, 2024 at 09:28:41AM -0700, Ivan Babrou wrote:
> On Mon, May 6, 2024 at 9:22 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
[...]
> >
> > The reason I asked about MEMCG_SOCK was that it might be causing larger
> > update trees (more cgroups) on CPUs processing the NET_RX.
> 
> We pass cgroup.memory=nosocket in the kernel cmdline:
> 
> * https://lore.kernel.org/lkml/CABWYdi0G7cyNFbndM-ELTDAR3x4Ngm0AehEp5aP0tfNkXUE+Uw@mail.gmail.com/
> 

Ah, another thing we should fix for you folks. Is it possible to repro
the issue on the latest Linus tree and report back? I will be busy for
the next 2 weeks but will get back to this after that.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-06 16:22                 ` Shakeel Butt
  2024-05-06 16:28                   ` Ivan Babrou
@ 2024-05-06 19:54                   ` Jesper Dangaard Brouer
  1 sibling, 0 replies; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-06 19:54 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Waiman Long, tj, hannes, lizefan.x, cgroups, yosryahmed, netdev,
	linux-mm, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior, Daniel Dao, Ivan Babrou, jr

[-- Attachment #1: Type: text/plain, Size: 458 bytes --]


On 06/05/2024 18.22, Shakeel Butt wrote:
> Anyways did the mutex change helped your production workload regarding
> latencies?

Yes, see the attached NET_RX softirq latency improvement with the mutex
patch [1], showing two Grafana heatmaps of softirq wait (not-able-to-run)
time. It is very clear that most of the latency spikes are gone with the
mutex patch applied.

--Jesper

[1] 
https://lore.kernel.org/all/171328989335.3930751.3091577850420501533.stgit@firesoul/

[-- Attachment #2: softirq_wait_time_compare01.png --]
[-- Type: image/png, Size: 119601 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-01 14:04 [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints Jesper Dangaard Brouer
  2024-05-01 14:24 ` Waiman Long
@ 2024-05-14  5:18 ` Jesper Dangaard Brouer
  2024-05-14  5:55   ` Tejun Heo
  2024-05-15 16:58 ` Tejun Heo
  2 siblings, 1 reply; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2024-05-14  5:18 UTC (permalink / raw)
  To: tj, hannes, lizefan.x, cgroups, yosryahmed, longman
  Cc: netdev, linux-mm, shakeel.butt, kernel-team,
	Arnaldo Carvalho de Melo, Sebastian Andrzej Siewior

Hi Tejun,

Could we please apply this for-6.10, to avoid splitting tracepoint 
changes over multiple kernel versions?

--Jesper
P.s. Cloudflare is already running with this applied in production.


On 01/05/2024 16.04, Jesper Dangaard Brouer wrote:
> This closely resembles helpers added for the global cgroup_rstat_lock in
> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
> 
> Based on production workloads, we observe the fast-path "update" function
> cgroup_rstat_updated() is invoked around 3 million times per sec, while the
> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
> can see periodic spikes of 700 invocations/sec.
> 
> For this reason, the tracepoints are split into normal and fastpath
> versions for this per-CPU lock. Making it feasible for production to
> continuously monitor the non-fastpath tracepoint to detect lock contention
> issues. The reason for monitoring is that lock disables IRQs which can
> disturb e.g. softirq processing on the local CPUs involved. When the
> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
> this per CPU lock becomes the next bottleneck that can introduce latency
> variations.
> 
> A practical bpftrace script for monitoring contention latency:
> 
>   bpftrace -e '
>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>       @start[tid]=nsecs; @cnt[probe]=count()}
>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>       if (args->contended) {
>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>       @cnt[probe]=count()}
>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'
> 
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> ---
>   include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
>   kernel/cgroup/rstat.c         |   70 ++++++++++++++++++++++++++++++++++-------
>   2 files changed, 108 insertions(+), 18 deletions(-)
> 
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index 13f375800135..0b95865a90f3 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>   
>   DECLARE_EVENT_CLASS(cgroup_rstat,
>   
> -	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>   
> -	TP_ARGS(cgrp, cpu_in_loop, contended),
> +	TP_ARGS(cgrp, cpu, contended),
>   
>   	TP_STRUCT__entry(
>   		__field(	int,		root			)
>   		__field(	int,		level			)
>   		__field(	u64,		id			)
> -		__field(	int,		cpu_in_loop		)
> +		__field(	int,		cpu			)
>   		__field(	bool,		contended		)
>   	),
>   
> @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>   		__entry->root = cgrp->root->hierarchy_id;
>   		__entry->id = cgroup_id(cgrp);
>   		__entry->level = cgrp->level;
> -		__entry->cpu_in_loop = cpu_in_loop;
> +		__entry->cpu = cpu;
>   		__entry->contended = contended;
>   	),
>   
> -	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
> +	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
>   		  __entry->root, __entry->id, __entry->level,
> -		  __entry->cpu_in_loop, __entry->contended)
> +		  __entry->cpu, __entry->contended)
>   );
>   
> +/* Related to global: cgroup_rstat_lock */
>   DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>   
>   	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>   	TP_ARGS(cgrp, cpu, contended)
>   );
>   
> +/* Related to per CPU: cgroup_rstat_cpu_lock */
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
>   #endif /* _TRACE_CGROUP_H */
>   
>   /* This part must be outside protection */
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 52e3b0ed1cee..fb8b49437573 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>   	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>   }
>   
> +/*
> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + *
> + * This makes it easier to diagnose locking issues and contention in
> + * production environments. The parameter @fast_path determines the
> + * tracepoints being added, allowing us to diagnose "flush" related
> + * operations without handling high-frequency fast-path "update" events.
> + */
> +static __always_inline
> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
> +				     struct cgroup *cgrp, const bool fast_path)
> +{
> +	unsigned long flags;
> +	bool contended;
> +
> +	/*
> +	 * The _irqsave() is needed because cgroup_rstat_lock is
> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> +	 * this lock with the _irq() suffix only disables interrupts on
> +	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> +	 * interrupts on both configurations. The _irqsave() ensures
> +	 * that interrupts are always disabled and later restored.
> +	 */
> +	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
> +	if (contended) {
> +		if (fast_path)
> +			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
> +		else
> +			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
> +
> +		raw_spin_lock_irqsave(cpu_lock, flags);
> +	}
> +
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
> +	else
> +		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
> +
> +	return flags;
> +}
> +
> +static __always_inline
> +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
> +			      struct cgroup *cgrp, unsigned long flags,
> +			      const bool fast_path)
> +{
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
> +	else
> +		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
> +
> +	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +}
> +
>   /**
>    * cgroup_rstat_updated - keep track of updated rstat_cpu
>    * @cgrp: target cgroup
> @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
>   		return;
>   
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>   
>   	/* put @cgrp and all ancestors on the corresponding updated lists */
>   	while (true) {
> @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   		cgrp = parent;
>   	}
>   
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
>   }
>   
>   /**
> @@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	struct cgroup *head = NULL, *parent, *child;
>   	unsigned long flags;
>   
> -	/*
> -	 * The _irqsave() is needed because cgroup_rstat_lock is
> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> -	 * this lock with the _irq() suffix only disables interrupts on
> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> -	 * interrupts on both configurations. The _irqsave() ensures
> -	 * that interrupts are always disabled and later restored.
> -	 */
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
>   
>   	/* Return NULL if this subtree is not on-list */
>   	if (!rstatc->updated_next)
> @@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	if (child != root)
>   		head = cgroup_rstat_push_children(head, child, cpu);
>   unlock_ret:
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
>   	return head;
>   }
>   
> 
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-14  5:18 ` Jesper Dangaard Brouer
@ 2024-05-14  5:55   ` Tejun Heo
  2024-05-14 16:59     ` Waiman Long
  0 siblings, 1 reply; 21+ messages in thread
From: Tejun Heo @ 2024-05-14  5:55 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: hannes, lizefan.x, cgroups, yosryahmed, longman, netdev, linux-mm,
	shakeel.butt, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior

On Tue, May 14, 2024 at 07:18:18AM +0200, Jesper Dangaard Brouer wrote:
> Hi Tejun,
> 
> Could we please apply this for-6.10, to avoid splitting tracepoint changes
> over multiple kernel versions?

Yeah, I can. Waiman, would that be okay?

Thanks.

-- 
tejun


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-14  5:55   ` Tejun Heo
@ 2024-05-14 16:59     ` Waiman Long
  0 siblings, 0 replies; 21+ messages in thread
From: Waiman Long @ 2024-05-14 16:59 UTC (permalink / raw)
  To: Tejun Heo, Jesper Dangaard Brouer
  Cc: hannes, lizefan.x, cgroups, yosryahmed, netdev, linux-mm,
	shakeel.butt, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior


On 5/14/24 01:55, Tejun Heo wrote:
> On Tue, May 14, 2024 at 07:18:18AM +0200, Jesper Dangaard Brouer wrote:
>> Hi Tejun,
>>
>> Could we please apply this for-6.10, to avoid splitting tracepoint changes
>> over multiple kernel versions?
> Yeah, I can. Waiman, would that be okay?

I am OK with this commit as it shouldn't impose that much of a 
performance cost if the tracepoints are activated.
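
A quick way to gauge that cost on a live system would be to simply count
how often the fast-path tracepoint fires; a minimal bpftrace sketch
(tracepoint name taken from the patch):

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_cpu_locked_fastpath { @hits = count(); }
   interval:s:1 { time("%H:%M:%S "); print(@hits); clear(@hits); }'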

Cheers,
Longman



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  2024-05-01 14:04 [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints Jesper Dangaard Brouer
  2024-05-01 14:24 ` Waiman Long
  2024-05-14  5:18 ` Jesper Dangaard Brouer
@ 2024-05-15 16:58 ` Tejun Heo
  2 siblings, 0 replies; 21+ messages in thread
From: Tejun Heo @ 2024-05-15 16:58 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: hannes, lizefan.x, cgroups, yosryahmed, longman, netdev, linux-mm,
	shakeel.butt, kernel-team, Arnaldo Carvalho de Melo,
	Sebastian Andrzej Siewior

On Wed, May 01, 2024 at 04:04:11PM +0200, Jesper Dangaard Brouer wrote:
> This closely resembles helpers added for the global cgroup_rstat_lock in
> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
> 
> Based on production workloads, we observe the fast-path "update" function
> cgroup_rstat_updated() is invoked around 3 million times per sec, while the
> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
> can see periodic spikes of 700 invocations/sec.
> 
> For this reason, the tracepoints are split into normal and fastpath
> versions for this per-CPU lock. Making it feasible for production to
> continuously monitor the non-fastpath tracepoint to detect lock contention
> issues. The reason for monitoring is that lock disables IRQs which can
> disturb e.g. softirq processing on the local CPUs involved. When the
> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
> this per CPU lock becomes the next bottleneck that can introduce latency
> variations.
> 
> A practical bpftrace script for monitoring contention latency:
> 
>  bpftrace -e '
>    tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>      @start[tid]=nsecs; @cnt[probe]=count()}
>    tracepoint:cgroup:cgroup_rstat_cpu_locked {
>      if (args->contended) {
>        @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>      @cnt[probe]=count()}
>    interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'
> 
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>

Applied to cgroup/for-6.10.

Thanks.

-- 
tejun


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2024-05-15 16:58 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-01 14:04 [PATCH v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints Jesper Dangaard Brouer
2024-05-01 14:24 ` Waiman Long
2024-05-01 17:22   ` Jesper Dangaard Brouer
2024-05-01 18:41     ` Waiman Long
2024-05-02 11:23       ` Jesper Dangaard Brouer
2024-05-02 18:19         ` Waiman Long
2024-05-03 14:00           ` Jesper Dangaard Brouer
2024-05-03 14:30             ` Waiman Long
2024-05-03 19:18             ` Shakeel Butt
2024-05-06 12:03               ` Jesper Dangaard Brouer
2024-05-06 16:22                 ` Shakeel Butt
2024-05-06 16:28                   ` Ivan Babrou
2024-05-06 19:45                     ` Shakeel Butt
2024-05-06 19:54                   ` Jesper Dangaard Brouer
2024-05-02 19:44     ` Shakeel Butt
2024-05-03 12:58       ` Jesper Dangaard Brouer
2024-05-03 18:11         ` Shakeel Butt
2024-05-14  5:18 ` Jesper Dangaard Brouer
2024-05-14  5:55   ` Tejun Heo
2024-05-14 16:59     ` Waiman Long
2024-05-15 16:58 ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).