* [PATCH RESEND v2] kernel-shark: Multi-thread the computaion of stream/combo plots @ 2025-03-14 22:07 Libo Chen 2025-03-23 16:01 ` Yordan Karadzhov 0 siblings, 1 reply; 4+ messages in thread From: Libo Chen @ 2025-03-14 22:07 UTC (permalink / raw) To: y.karadz; +Cc: linux-trace-devel Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically speed up graph rendering particularly for traces from very large systems. OpenMP technically is a new dependency here, but it's part of GCC, so long as your GCC >= v4.9, the libgomp library will make the code compiled. Signed-off-by: Libo Chen <libo.chen@oracle.com> --- CMakeLists.txt | 6 ++++++ src/KsGLWidget.cpp | 25 +++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 988bfd6..7847177 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") +find_package(OpenMP 3.2.5) +if (OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif(OPENMP_FOUND) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp index 9311d98..004d64b 100644 --- a/src/KsGLWidget.cpp +++ b/src/KsGLWidget.cpp @@ -13,6 +13,9 @@ #include <GL/glut.h> #include <GL/gl.h> +// OpenMP +#include <omp.h> + // KernelShark #include "libkshark-plugin.h" #include "KsGLWidget.hpp" @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs() return graph; }; + omp_set_num_threads(omp_get_num_procs()); for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) { sd = it.key(); + QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count()); + QVector<KsPlot::Graph *> 
taskGraphs(it.value()._taskList.count()); + /* Create CPU graphs according to the cpuList. */ it.value()._cpuGraphs = {}; + #pragma omp parallel for for (auto const &cpu: it.value()._cpuList) { - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing); + int idx = it.value()._cpuList.indexOf(cpu); + cpuGraphs[idx] = _newCPUGraph(sd, cpu); + } + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs); + while (itCpuGraphs.hasNext()) { + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing); it.value()._cpuGraphs.append(g); } /* Create Task graphs according to the taskList. */ it.value()._taskGraphs = {}; + #pragma omp parallel for for (auto const &pid: it.value()._taskList) { - g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing); + int idx = it.value()._taskList.indexOf(pid); + taskGraphs[idx] = _newTaskGraph(sd, pid); + } + QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs); + while (itTaskGraphs.hasNext()) { + g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing); it.value()._taskGraphs.append(g); } + } for (auto &c: _comboPlots) { int n = c.count(); + #pragma omp parallel for for (int i = 0; i < n; ++i) { sd = c[i]._streamId; if (c[i]._type & KSHARK_TASK_DRAW) { -- 2.43.5 ^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH RESEND v2] kernel-shark: Multi-thread the computaion of stream/combo plots 2025-03-14 22:07 [PATCH RESEND v2] kernel-shark: Multi-thread the computaion of stream/combo plots Libo Chen @ 2025-03-23 16:01 ` Yordan Karadzhov 2025-03-24 10:08 ` Libo Chen 0 siblings, 1 reply; 4+ messages in thread From: Yordan Karadzhov @ 2025-03-23 16:01 UTC (permalink / raw) To: Libo Chen; +Cc: linux-trace-devel Hi Libo, Please see my comments below. On 3/15/25 00:07, Libo Chen wrote: > Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically > speed up graph rendering particularly for traces from very large systems. > > OpenMP technically is a new dependency here, but it's part of GCC, so long > as your GCC >= v4.9, the libgomp library will make the code compiled. > > Signed-off-by: Libo Chen <libo.chen@oracle.com> > --- > CMakeLists.txt | 6 ++++++ > src/KsGLWidget.cpp | 25 +++++++++++++++++++++++-- > 2 files changed, 29 insertions(+), 2 deletions(-) > > diff --git a/CMakeLists.txt b/CMakeLists.txt > index 988bfd6..7847177 100644 > --- a/CMakeLists.txt > +++ b/CMakeLists.txt > @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin") > set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") > set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") > > +find_package(OpenMP 3.2.5) > +if (OPENMP_FOUND) > + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") > + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") > +endif(OPENMP_FOUND) > + > set(CMAKE_CXX_STANDARD 17) > set(CMAKE_CXX_STANDARD_REQUIRED ON) > set(CMAKE_CXX_EXTENSIONS OFF) > diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp > index 9311d98..004d64b 100644 > --- a/src/KsGLWidget.cpp > +++ b/src/KsGLWidget.cpp > @@ -13,6 +13,9 @@ > #include <GL/glut.h> > #include <GL/gl.h> > > +// OpenMP > +#include <omp.h> > + > // KernelShark > #include "libkshark-plugin.h" > #include "KsGLWidget.hpp" > @@ -688,25 +691,43 @@ void 
KsGLWidget::_makeGraphs() > return graph; > }; > > + omp_set_num_threads(omp_get_num_procs()); I think I already asked you to check if it is possible to move this to the constructor of the widget so that it is called just once. If there is some reason why this is not possible, at least provide some explanation. > for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) { > sd = it.key(); > + QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count()); > + QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count()); > + > /* Create CPU graphs according to the cpuList. */ > it.value()._cpuGraphs = {}; > + #pragma omp parallel for > for (auto const &cpu: it.value()._cpuList) { > - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing); > + int idx = it.value()._cpuList.indexOf(cpu); Maybe I do not understand what you want to do here, but this looks over-complicated to me. Isn't it equivalent to having simply for (size_t idx = 0; idx < nCpus; ++idx) { int cpu = it.value()._cpuList[idx]; The same comment applies for the other loop below. > + cpuGraphs[idx] = _newCPUGraph(sd, cpu); > + } > + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs); > + while (itCpuGraphs.hasNext()) { > + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing); > it.value()._cpuGraphs.append(g); > } > > /* Create Task graphs according to the taskList. */ > it.value()._taskGraphs = {}; > + #pragma omp parallel for > for (auto const &pid: it.value()._taskList) { > - g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing); > + int idx = it.value()._taskList.indexOf(pid); > + taskGraphs[idx] = _newTaskGraph(sd, pid); > + } > + QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs); > + while (itTaskGraphs.hasNext()) { > + g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing); > it.value()._taskGraphs.append(g); > } > + Please remove this empty line. Beside those minor things, the patch looks good to me. Please address the comments and I will be happy to apply your patch. 
Thanks for helping us improve KernelShark! Cheers, Yordan > } > > for (auto &c: _comboPlots) { > int n = c.count(); > + #pragma omp parallel for > for (int i = 0; i < n; ++i) { > sd = c[i]._streamId; > if (c[i]._type & KSHARK_TASK_DRAW) { ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH RESEND v2] kernel-shark: Multi-thread the computaion of stream/combo plots 2025-03-23 16:01 ` Yordan Karadzhov @ 2025-03-24 10:08 ` Libo Chen 2025-03-26 23:46 ` Libo Chen 0 siblings, 1 reply; 4+ messages in thread From: Libo Chen @ 2025-03-24 10:08 UTC (permalink / raw) To: Yordan Karadzhov; +Cc: linux-trace-devel On 3/23/25 09:01, Yordan Karadzhov wrote: > Hi Libo, > Please see my comments below. > > On 3/15/25 00:07, Libo Chen wrote: >> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically >> speed up graph rendering particularly for traces from very large systems. >> >> OpenMP technically is a new dependency here, but it's part of GCC, so long >> as your GCC >= v4.9, the libgomp library will make the code compiled. >> >> Signed-off-by: Libo Chen <libo.chen@oracle.com> >> --- >> CMakeLists.txt | 6 ++++++ >> src/KsGLWidget.cpp | 25 +++++++++++++++++++++++-- >> 2 files changed, 29 insertions(+), 2 deletions(-) >> >> diff --git a/CMakeLists.txt b/CMakeLists.txt >> index 988bfd6..7847177 100644 >> --- a/CMakeLists.txt >> +++ b/CMakeLists.txt >> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin") >> set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") >> set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") >> +find_package(OpenMP 3.2.5) >> +if (OPENMP_FOUND) >> + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") >> + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") >> +endif(OPENMP_FOUND) >> + >> set(CMAKE_CXX_STANDARD 17) >> set(CMAKE_CXX_STANDARD_REQUIRED ON) >> set(CMAKE_CXX_EXTENSIONS OFF) >> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp >> index 9311d98..004d64b 100644 >> --- a/src/KsGLWidget.cpp >> +++ b/src/KsGLWidget.cpp >> @@ -13,6 +13,9 @@ >> #include <GL/glut.h> >> #include <GL/gl.h> >> +// OpenMP >> +#include <omp.h> >> + >> // KernelShark >> #include "libkshark-plugin.h" >> #include "KsGLWidget.hpp" >> @@ -688,25 +691,43 @@ void 
KsGLWidget::_makeGraphs() >> return graph; >> }; >> + omp_set_num_threads(omp_get_num_procs()); > I think I already asked you to check if it is possible to move this to the constructor of the widget so that it is called just once. If there is some reason why this is not possible, at least provide some explanation. Hi Yordan, thanks for your review. oops I missed that, will move it over. > >> for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) { >> sd = it.key(); >> + QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count()); >> + QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count()); >> + >> /* Create CPU graphs according to the cpuList. */ >> it.value()._cpuGraphs = {}; >> + #pragma omp parallel for >> for (auto const &cpu: it.value()._cpuList) { >> - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing); >> + int idx = it.value()._cpuList.indexOf(cpu); > > Maybe I do not understand what you want to do here, but this looks over-complicated to me. Isn't it equivalent to having simply > > for (size_t idx = 0; idx < nCpus; ++idx) { > int cpu = it.value()._cpuList[idx]; > I think _cpuList isn't exactly the same as [0..nCpus) here. In a default plot, some idle cpus may not be appended to the vector /* Do not add plots for idle CPUs. */ if (!kshark_hash_id_find(stream->idle_cpus, cpu)) plotVec.append(cpu); Also you can set a subset of CPUs to show, so _cpuList could be quite random. Best, Libo > The same comment applies for the other loop below. > >> + cpuGraphs[idx] = _newCPUGraph(sd, cpu); >> + } >> + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs); >> + while (itCpuGraphs.hasNext()) { >> + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing); >> it.value()._cpuGraphs.append(g); >> } >> /* Create Task graphs according to the taskList. 
*/ >> it.value()._taskGraphs = {}; >> + #pragma omp parallel for >> for (auto const &pid: it.value()._taskList) { >> - g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing); >> + int idx = it.value()._taskList.indexOf(pid); >> + taskGraphs[idx] = _newTaskGraph(sd, pid); >> + } >> + QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs); >> + while (itTaskGraphs.hasNext()) { >> + g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing); >> it.value()._taskGraphs.append(g); >> } >> + > Please remove this empty line. > > Beside those minor things, the patch looks good to me. Please address the comments and I will be happy to apply your patch. > > Thanks for helping us improve KerrnelShark! > > Cheers, > Yordan >> } >> for (auto &c: _comboPlots) { >> int n = c.count(); >> + #pragma omp parallel for >> for (int i = 0; i < n; ++i) { >> sd = c[i]._streamId; >> if (c[i]._type & KSHARK_TASK_DRAW) { ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH RESEND v2] kernel-shark: Multi-thread the computaion of stream/combo plots 2025-03-24 10:08 ` Libo Chen @ 2025-03-26 23:46 ` Libo Chen 0 siblings, 0 replies; 4+ messages in thread From: Libo Chen @ 2025-03-26 23:46 UTC (permalink / raw) To: Yordan Karadzhov; +Cc: linux-trace-devel On 3/24/25 03:08, Libo Chen wrote: > > > On 3/23/25 09:01, Yordan Karadzhov wrote: >> Hi Libo, >> Please see my comments below. >> >> On 3/15/25 00:07, Libo Chen wrote: >>> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically >>> speed up graph rendering particularly for traces from very large systems. >>> >>> OpenMP technically is a new dependency here, but it's part of GCC, so long >>> as your GCC >= v4.9, the libgomp library will make the code compiled. >>> >>> Signed-off-by: Libo Chen <libo.chen@oracle.com> >>> --- >>> CMakeLists.txt | 6 ++++++ >>> src/KsGLWidget.cpp | 25 +++++++++++++++++++++++-- >>> 2 files changed, 29 insertions(+), 2 deletions(-) >>> >>> diff --git a/CMakeLists.txt b/CMakeLists.txt >>> index 988bfd6..7847177 100644 >>> --- a/CMakeLists.txt >>> +++ b/CMakeLists.txt >>> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin") >>> set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") >>> set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") >>> +find_package(OpenMP 3.2.5) >>> +if (OPENMP_FOUND) >>> + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") >>> + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") >>> +endif(OPENMP_FOUND) >>> + >>> set(CMAKE_CXX_STANDARD 17) >>> set(CMAKE_CXX_STANDARD_REQUIRED ON) >>> set(CMAKE_CXX_EXTENSIONS OFF) >>> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp >>> index 9311d98..004d64b 100644 >>> --- a/src/KsGLWidget.cpp >>> +++ b/src/KsGLWidget.cpp >>> @@ -13,6 +13,9 @@ >>> #include <GL/glut.h> >>> #include <GL/gl.h> >>> +// OpenMP >>> +#include <omp.h> >>> + >>> // KernelShark >>> #include "libkshark-plugin.h" >>> #include 
"KsGLWidget.hpp" >>> @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs() >>> return graph; >>> }; >>> + omp_set_num_threads(omp_get_num_procs()); >> I think I already asked you to check if it is possible to move this to the constructor of the widget so that it is called just once. If there is some reason why this is not possible, at least provide some explanation. > Hi Yordan, thanks for your review. > > oops I missed that, will move it over. >> >>> for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) { >>> sd = it.key(); >>> + QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count()); >>> + QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count()); >>> + >>> /* Create CPU graphs according to the cpuList. */ >>> it.value()._cpuGraphs = {}; >>> + #pragma omp parallel for >>> for (auto const &cpu: it.value()._cpuList) { >>> - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing); >>> + int idx = it.value()._cpuList.indexOf(cpu); >> >> Maybe I do not understand what you want to do here, but this looks over-complicated to me. Isn't it equivalent to having simply >> >> for (size_t idx = 0; idx < nCpus; ++idx) { >> int cpu = it.value()._cpuList[idx]; >> > I think _cpuList isn't exactly the same as [0..nCpus) here. In a default plot, some idle cpus may not be appended to the vector > > /* Do not add plots for idle CPUs. */ > if (!kshark_hash_id_find(stream->idle_cpus, cpu)) > plotVec.append(cpu); > > Also you can set a subset of CPUs to show, so _cpuList could be quite random. > Never mind, you're right. I got myself confused here. Will send you v3 soon. Thanks > > Best, > Libo >> The same comment applies for the other loop below. >> >>> + cpuGraphs[idx] = _newCPUGraph(sd, cpu); >>> + } >>> + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs); >>> + while (itCpuGraphs.hasNext()) { >>> + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing); >>> it.value()._cpuGraphs.append(g); >>> } >>> /* Create Task graphs according to the taskList. 
*/ >>> it.value()._taskGraphs = {}; >>> + #pragma omp parallel for >>> for (auto const &pid: it.value()._taskList) { >>> - g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing); >>> + int idx = it.value()._taskList.indexOf(pid); >>> + taskGraphs[idx] = _newTaskGraph(sd, pid); >>> + } >>> + QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs); >>> + while (itTaskGraphs.hasNext()) { >>> + g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing); >>> it.value()._taskGraphs.append(g); >>> } >>> + >> Please remove this empty line. >> >> Beside those minor things, the patch looks good to me. Please address the comments and I will be happy to apply your patch. >> >> Thanks for helping us improve KerrnelShark! >> >> Cheers, >> Yordan >>> } >>> for (auto &c: _comboPlots) { >>> int n = c.count(); >>> + #pragma omp parallel for >>> for (int i = 0; i < n; ++i) { >>> sd = c[i]._streamId; >>> if (c[i]._type & KSHARK_TASK_DRAW) { > ^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-03-26 23:46 UTC | newest] Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2025-03-14 22:07 [PATCH RESEND v2] kernel-shark: Multi-thread the computaion of stream/combo plots Libo Chen 2025-03-23 16:01 ` Yordan Karadzhov 2025-03-24 10:08 ` Libo Chen 2025-03-26 23:46 ` Libo Chen
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).