[PATCH RESEND v2] kernel-shark: Multi-thread the computation of stream/combo plots

linux-trace-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH RESEND v2] kernel-shark: Multi-thread the computation of stream/combo plots
@ 2025-03-14 22:07 Libo Chen
  2025-03-23 16:01 ` Yordan Karadzhov
  0 siblings, 1 reply; 4+ messages in thread
From: Libo Chen @ 2025-03-14 22:07 UTC (permalink / raw)
  To: y.karadz; +Cc: linux-trace-devel

Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
speed up graph rendering particularly for traces from very large systems.

OpenMP is technically a new dependency here, but it is part of GCC: as long
as your GCC is >= v4.9, its libgomp library is all that is needed to compile
the code.

Signed-off-by: Libo Chen <libo.chen@oracle.com>
---
 CMakeLists.txt     |  6 ++++++
 src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 988bfd6..7847177 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
 set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 
+find_package(OpenMP 3.2.5)
+if (OPENMP_FOUND)
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif(OPENMP_FOUND)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
index 9311d98..004d64b 100644
--- a/src/KsGLWidget.cpp
+++ b/src/KsGLWidget.cpp
@@ -13,6 +13,9 @@
 #include <GL/glut.h>
 #include <GL/gl.h>
 
+// OpenMP
+#include <omp.h>
+
 // KernelShark
 #include "libkshark-plugin.h"
 #include "KsGLWidget.hpp"
@@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs()
 		return graph;
 	};
 
+	omp_set_num_threads(omp_get_num_procs());
 	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
 		sd = it.key();
+		QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
+		QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
+
 		/* Create CPU graphs according to the cpuList. */
 		it.value()._cpuGraphs = {};
+		#pragma omp parallel for
 		for (auto const &cpu: it.value()._cpuList) {
-			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
+			int idx = it.value()._cpuList.indexOf(cpu);
+			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
+		}
+		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
+		while (itCpuGraphs.hasNext()) {
+			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
 			it.value()._cpuGraphs.append(g);
 		}
 
 		/* Create Task graphs according to the taskList. */
 		it.value()._taskGraphs = {};
+		#pragma omp parallel for
 		for (auto const &pid: it.value()._taskList) {
-			g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
+			int idx = it.value()._taskList.indexOf(pid);
+			taskGraphs[idx] = _newTaskGraph(sd, pid);
+		}
+		QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
+		while (itTaskGraphs.hasNext()) {
+			g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
 			it.value()._taskGraphs.append(g);
 		}
+
 	}
 
 	for (auto &c: _comboPlots) {
 		int n = c.count();
+		#pragma omp parallel for
 		for (int i = 0; i < n; ++i) {
 			sd = c[i]._streamId;
 			if (c[i]._type & KSHARK_TASK_DRAW) {
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH RESEND v2] kernel-shark: Multi-thread the computation of stream/combo plots
  2025-03-14 22:07 [PATCH RESEND v2] kernel-shark: Multi-thread the computation of stream/combo plots Libo Chen
@ 2025-03-23 16:01 ` Yordan Karadzhov
  2025-03-24 10:08   ` Libo Chen
  0 siblings, 1 reply; 4+ messages in thread
From: Yordan Karadzhov @ 2025-03-23 16:01 UTC (permalink / raw)
  To: Libo Chen; +Cc: linux-trace-devel

Hi Libo,
Please see my comments below.

On 3/15/25 00:07, Libo Chen wrote:
> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
> speed up graph rendering particularly for traces from very large systems.
> 
> OpenMP technically is a new dependency here, but it's part of GCC, so long
> as your GCC >= v4.9, the libgomp library will make the code compiled.
> 
> Signed-off-by: Libo Chen <libo.chen@oracle.com>
> ---
>   CMakeLists.txt     |  6 ++++++
>   src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
>   2 files changed, 29 insertions(+), 2 deletions(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index 988bfd6..7847177 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   
> +find_package(OpenMP 3.2.5)
> +if (OPENMP_FOUND)
> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
> +endif(OPENMP_FOUND)
> +
>   set(CMAKE_CXX_STANDARD 17)
>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>   set(CMAKE_CXX_EXTENSIONS OFF)
> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
> index 9311d98..004d64b 100644
> --- a/src/KsGLWidget.cpp
> +++ b/src/KsGLWidget.cpp
> @@ -13,6 +13,9 @@
>   #include <GL/glut.h>
>   #include <GL/gl.h>
>   
> +// OpenMP
> +#include <omp.h>
> +
>   // KernelShark
>   #include "libkshark-plugin.h"
>   #include "KsGLWidget.hpp"
> @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs()
>   		return graph;
>   	};
>   
> +	omp_set_num_threads(omp_get_num_procs());
I think I already asked you to check if it is possible to move this to 
the constructor of the widget so that it is called just once. If there 
is some reason why this is not possible, at least provide some explanation.

>   	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>   		sd = it.key();
> +		QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
> +		QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
> +
>   		/* Create CPU graphs according to the cpuList. */
>   		it.value()._cpuGraphs = {};
> +		#pragma omp parallel for
>   		for (auto const &cpu: it.value()._cpuList) {
> -			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
> +			int idx = it.value()._cpuList.indexOf(cpu);

Maybe I do not understand what you want to do here, but this looks 
over-complicated to me. Isn't it equivalent to having simply

		for (size_t idx = 0; idx < nCpus; ++idx) {
	 		int cpu = it.value()._cpuList[idx];

The same comment applies for the other loop below.

> +			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
> +		}
> +		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
> +		while (itCpuGraphs.hasNext()) {
> +			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>   			it.value()._cpuGraphs.append(g);
>   		}
>   
>   		/* Create Task graphs according to the taskList. */
>   		it.value()._taskGraphs = {};
> +		#pragma omp parallel for
>   		for (auto const &pid: it.value()._taskList) {
> -			g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
> +			int idx = it.value()._taskList.indexOf(pid);
> +			taskGraphs[idx] = _newTaskGraph(sd, pid);
> +		}
> +		QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
> +		while (itTaskGraphs.hasNext()) {
> +			g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
>   			it.value()._taskGraphs.append(g);
>   		}
> +
Please remove this empty line.

Besides those minor things, the patch looks good to me. Please address 
the comments and I will be happy to apply your patch.

Thanks for helping us improve KernelShark!

Cheers,
Yordan
>   	}
>   
>   	for (auto &c: _comboPlots) {
>   		int n = c.count();
> +		#pragma omp parallel for
>   		for (int i = 0; i < n; ++i) {
>   			sd = c[i]._streamId;
>   			if (c[i]._type & KSHARK_TASK_DRAW) {

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH RESEND v2] kernel-shark: Multi-thread the computation of stream/combo plots
  2025-03-23 16:01 ` Yordan Karadzhov
@ 2025-03-24 10:08   ` Libo Chen
  2025-03-26 23:46     ` Libo Chen
  0 siblings, 1 reply; 4+ messages in thread
From: Libo Chen @ 2025-03-24 10:08 UTC (permalink / raw)
  To: Yordan Karadzhov; +Cc: linux-trace-devel



On 3/23/25 09:01, Yordan Karadzhov wrote:
> Hi Libo,
> Please see my comments below.
> 
> On 3/15/25 00:07, Libo Chen wrote:
>> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
>> speed up graph rendering particularly for traces from very large systems.
>>
>> OpenMP technically is a new dependency here, but it's part of GCC, so long
>> as your GCC >= v4.9, the libgomp library will make the code compiled.
>>
>> Signed-off-by: Libo Chen <libo.chen@oracle.com>
>> ---
>>   CMakeLists.txt     |  6 ++++++
>>   src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
>>   2 files changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/CMakeLists.txt b/CMakeLists.txt
>> index 988bfd6..7847177 100644
>> --- a/CMakeLists.txt
>> +++ b/CMakeLists.txt
>> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>   +find_package(OpenMP 3.2.5)
>> +if (OPENMP_FOUND)
>> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
>> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
>> +endif(OPENMP_FOUND)
>> +
>>   set(CMAKE_CXX_STANDARD 17)
>>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>>   set(CMAKE_CXX_EXTENSIONS OFF)
>> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
>> index 9311d98..004d64b 100644
>> --- a/src/KsGLWidget.cpp
>> +++ b/src/KsGLWidget.cpp
>> @@ -13,6 +13,9 @@
>>   #include <GL/glut.h>
>>   #include <GL/gl.h>
>>   +// OpenMP
>> +#include <omp.h>
>> +
>>   // KernelShark
>>   #include "libkshark-plugin.h"
>>   #include "KsGLWidget.hpp"
>> @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs()
>>           return graph;
>>       };
>>   +    omp_set_num_threads(omp_get_num_procs());
> I think I already asked you to check if it is possible to move this to the constructor of the widget so that it is called just once. If there is some reason why this is not possible, at least provide some explanation.
Hi Yordan, thanks for your review.

Oops, I missed that; I will move it over.
> 
>>       for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>>           sd = it.key();
>> +        QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
>> +        QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
>> +
>>           /* Create CPU graphs according to the cpuList. */
>>           it.value()._cpuGraphs = {};
>> +        #pragma omp parallel for
>>           for (auto const &cpu: it.value()._cpuList) {
>> -            g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
>> +            int idx = it.value()._cpuList.indexOf(cpu);
> 
> Maybe I do not understand what you want to do here, but this looks over-complicated to me. Isn't it equivalent to having simply
> 
>         for (size_t idx = 0; idx < nCpus; ++idx) {
>              int cpu = it.value()._cpuList[idx];
> 
I think _cpuList isn't exactly the same as [0..nCpus) here. In a default plot, some idle CPUs may not be appended to the vector:

	/* Do not add plots for idle CPUs. */
	if (!kshark_hash_id_find(stream->idle_cpus, cpu))
		plotVec.append(cpu);

Also, you can choose a subset of CPUs to show, so _cpuList could be quite arbitrary.


Best,
Libo
> The same comment applies for the other loop below.
> 
>> +            cpuGraphs[idx] = _newCPUGraph(sd, cpu);
>> +        }
>> +        QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
>> +        while (itCpuGraphs.hasNext()) {
>> +            g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>>               it.value()._cpuGraphs.append(g);
>>           }
>>             /* Create Task graphs according to the taskList. */
>>           it.value()._taskGraphs = {};
>> +        #pragma omp parallel for
>>           for (auto const &pid: it.value()._taskList) {
>> -            g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
>> +            int idx = it.value()._taskList.indexOf(pid);
>> +            taskGraphs[idx] = _newTaskGraph(sd, pid);
>> +        }
>> +        QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
>> +        while (itTaskGraphs.hasNext()) {
>> +            g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
>>               it.value()._taskGraphs.append(g);
>>           }
>> +
> Please remove this empty line.
> 
> Beside those minor things, the patch looks good to me. Please address the comments and I will be happy to apply your patch.
> 
> Thanks for helping us improve KerrnelShark!
> 
> Cheers,
> Yordan
>>       }
>>         for (auto &c: _comboPlots) {
>>           int n = c.count();
>> +        #pragma omp parallel for
>>           for (int i = 0; i < n; ++i) {
>>               sd = c[i]._streamId;
>>               if (c[i]._type & KSHARK_TASK_DRAW) {


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH RESEND v2] kernel-shark: Multi-thread the computation of stream/combo plots
  2025-03-24 10:08   ` Libo Chen
@ 2025-03-26 23:46     ` Libo Chen
  0 siblings, 0 replies; 4+ messages in thread
From: Libo Chen @ 2025-03-26 23:46 UTC (permalink / raw)
  To: Yordan Karadzhov; +Cc: linux-trace-devel



On 3/24/25 03:08, Libo Chen wrote:
> 
> 
> On 3/23/25 09:01, Yordan Karadzhov wrote:
>> Hi Libo,
>> Please see my comments below.
>>
>> On 3/15/25 00:07, Libo Chen wrote:
>>> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
>>> speed up graph rendering particularly for traces from very large systems.
>>>
>>> OpenMP technically is a new dependency here, but it's part of GCC, so long
>>> as your GCC >= v4.9, the libgomp library will make the code compiled.
>>>
>>> Signed-off-by: Libo Chen <libo.chen@oracle.com>
>>> ---
>>>   CMakeLists.txt     |  6 ++++++
>>>   src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
>>>   2 files changed, 29 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/CMakeLists.txt b/CMakeLists.txt
>>> index 988bfd6..7847177 100644
>>> --- a/CMakeLists.txt
>>> +++ b/CMakeLists.txt
>>> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>>>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>>   +find_package(OpenMP 3.2.5)
>>> +if (OPENMP_FOUND)
>>> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
>>> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
>>> +endif(OPENMP_FOUND)
>>> +
>>>   set(CMAKE_CXX_STANDARD 17)
>>>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>>>   set(CMAKE_CXX_EXTENSIONS OFF)
>>> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
>>> index 9311d98..004d64b 100644
>>> --- a/src/KsGLWidget.cpp
>>> +++ b/src/KsGLWidget.cpp
>>> @@ -13,6 +13,9 @@
>>>   #include <GL/glut.h>
>>>   #include <GL/gl.h>
>>>   +// OpenMP
>>> +#include <omp.h>
>>> +
>>>   // KernelShark
>>>   #include "libkshark-plugin.h"
>>>   #include "KsGLWidget.hpp"
>>> @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs()
>>>           return graph;
>>>       };
>>>   +    omp_set_num_threads(omp_get_num_procs());
>> I think I already asked you to check if it is possible to move this to the constructor of the widget so that it is called just once. If there is some reason why this is not possible, at least provide some explanation.
> Hi Yordan, thanks for your review.
> 
> oops I missed that, will move it over.
>>
>>>       for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>>>           sd = it.key();
>>> +        QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
>>> +        QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
>>> +
>>>           /* Create CPU graphs according to the cpuList. */
>>>           it.value()._cpuGraphs = {};
>>> +        #pragma omp parallel for
>>>           for (auto const &cpu: it.value()._cpuList) {
>>> -            g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
>>> +            int idx = it.value()._cpuList.indexOf(cpu);
>>
>> Maybe I do not understand what you want to do here, but this looks over-complicated to me. Isn't it equivalent to having simply
>>
>>         for (size_t idx = 0; idx < nCpus; ++idx) {
>>              int cpu = it.value()._cpuList[idx];
>>
> I think _cpuList isn't exactly the same as [0..nCpus) here. In a default plot, some idle cpus may not be appended to the vector
> 
> 	/* Do not add plots for idle CPUs. */
> 	if (!kshark_hash_id_find(stream->idle_cpus, cpu))
> 		plotVec.append(cpu);
> 
> Also you can set a subset of CPUs to show, so _cpuList could be quite random. 
> 
Never mind, you're right. I got myself confused here. I will send you v3 soon. Thanks.
> 
> Best,
> Libo
>> The same comment applies for the other loop below.
>>
>>> +            cpuGraphs[idx] = _newCPUGraph(sd, cpu);
>>> +        }
>>> +        QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
>>> +        while (itCpuGraphs.hasNext()) {
>>> +            g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>>>               it.value()._cpuGraphs.append(g);
>>>           }
>>>             /* Create Task graphs according to the taskList. */
>>>           it.value()._taskGraphs = {};
>>> +        #pragma omp parallel for
>>>           for (auto const &pid: it.value()._taskList) {
>>> -            g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
>>> +            int idx = it.value()._taskList.indexOf(pid);
>>> +            taskGraphs[idx] = _newTaskGraph(sd, pid);
>>> +        }
>>> +        QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
>>> +        while (itTaskGraphs.hasNext()) {
>>> +            g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
>>>               it.value()._taskGraphs.append(g);
>>>           }
>>> +
>> Please remove this empty line.
>>
>> Beside those minor things, the patch looks good to me. Please address the comments and I will be happy to apply your patch.
>>
>> Thanks for helping us improve KerrnelShark!
>>
>> Cheers,
>> Yordan
>>>       }
>>>         for (auto &c: _comboPlots) {
>>>           int n = c.count();
>>> +        #pragma omp parallel for
>>>           for (int i = 0; i < n; ++i) {
>>>               sd = c[i]._streamId;
>>>               if (c[i]._type & KSHARK_TASK_DRAW) {
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-03-26 23:46 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-03-14 22:07 [PATCH RESEND v2] kernel-shark: Multi-thread the computation of stream/combo plots Libo Chen
2025-03-23 16:01 ` Yordan Karadzhov
2025-03-24 10:08   ` Libo Chen
2025-03-26 23:46     ` Libo Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).