public inbox for kvm@vger.kernel.org
* [PATCH 1/2] IOzone test: Introduce postprocessing module
@ 2010-04-30 21:20 Lucas Meneghel Rodrigues
  2010-04-30 21:20 ` [PATCH 2/2] IOzone test: Introduce additional results postprocessing Lucas Meneghel Rodrigues
  2010-04-30 21:23 ` [Autotest] [PATCH 1/2] IOzone test: Introduce postprocessing module Martin Bligh
  0 siblings, 2 replies; 5+ messages in thread
From: Lucas Meneghel Rodrigues @ 2010-04-30 21:20 UTC (permalink / raw)
  To: autotest; +Cc: kvm

This module contains code to postprocess IOzone data
in a convenient way so we can generate performance graphs
and condensed data. The graph generation part depends
on gnuplot, but if the utility is not present,
functionality will gracefully degrade.

The reason why this was created as a separate module is:
 * It doesn't pollute the main test class.
 * It allows us to use the postprocess module as a standalone program
   that can even do a performance comparison between two IOzone runs.
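The condensed data mentioned above is produced by taking the geometric mean of
each throughput column, which suits rate-like measurements better than an
arithmetic mean. A minimal sketch of that aggregation, mirroring the
geometric_mean helper in the patch below (written here in modern Python for
illustration):

```python
import math

def geometric_mean(values):
    # exp of the mean of logs: numerically safer than multiplying all
    # values together and then taking the n-th root
    if not values:
        return None
    return math.exp(sum(math.log(v) for v in values) / len(values))

# geometric_mean([2, 8]) is 4.0 (within floating-point tolerance)
```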

Signed-off-by: Lucas Meneghel Rodrigues <lmr@redhat.com>
---
 client/tests/iozone/postprocessing.py |  487 +++++++++++++++++++++++++++++++++
 1 files changed, 487 insertions(+), 0 deletions(-)
 create mode 100755 client/tests/iozone/postprocessing.py

diff --git a/client/tests/iozone/postprocessing.py b/client/tests/iozone/postprocessing.py
new file mode 100755
index 0000000..b495502
--- /dev/null
+++ b/client/tests/iozone/postprocessing.py
@@ -0,0 +1,487 @@
+#!/usr/bin/python
+"""
+Postprocessing module for IOzone. It picks up results from an
+IOzone run, calculates the geometric mean of all throughput results for
+a given file size or record size, and then generates a series of 2D and 3D
+graphs. The graph generation functionality depends on gnuplot; if it
+is not present, functionality degrades gracefully.
+
+@copyright: Red Hat 2010
+"""
+import os, sys, optparse, logging, math, time
+import common
+from autotest_lib.client.common_lib import logging_config, logging_manager
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.bin import utils, os_dep
+
+
+_LABELS = ('file_size', 'record_size', 'write', 'rewrite', 'read', 'reread',
+           'randread', 'randwrite', 'bkwdread', 'recordrewrite', 'strideread',
+           'fwrite', 'frewrite', 'fread', 'freread')
+
+
+def unique(values):
+    """
+    Return a list of the elements in values, without duplicates.
+
+    @param values: List with values.
+    @return: List with non duplicate elements, or None if the elements
+            are unhashable.
+    """
+    if not values:
+        return []
+    u = {}
+    try:
+        for x in values:
+            u[x] = 1
+    except TypeError:
+        return None
+    else:
+        return u.keys()
+
+
+def geometric_mean(values):
+    """
+    Evaluates the geometric mean for a list of numeric values.
+
+    @param values: List with values.
+    @return: Single value representing the geometric mean for the list values.
+    @see: http://en.wikipedia.org/wiki/Geometric_mean
+    """
+    try:
+        values = [int(value) for value in values]
+    except ValueError:
+        return None
+    n = len(values)
+    if n == 0:
+        return None
+    return math.exp(sum([math.log(x) for x in values])/n)
+
+
+def compare_matrices(matrix1, matrix2, threshold=0.05):
+    """
+    Compare 2 matrices nxm and return a matrix nxm with comparison data.
+
+    @param matrix1: Reference matrix with numeric data.
+    @param matrix2: Matrix that will be compared.
+    @param threshold: Any difference bigger than this percent threshold
+            will be reported.
+    """
+    improvements = 0
+    regressions = 0
+    same = 0
+
+    new_matrix = []
+    for line1, line2 in zip(matrix1, matrix2):
+        new_line = []
+        for i, (element1, element2) in enumerate(zip(line1, line2)):
+            ratio = float(element2) / float(element1)
+            if ratio < (1 - threshold):
+                regressions += 1
+                # Signed percent difference relative to the reference run
+                new_line.append(100 * (ratio - 1))
+            elif ratio > (1 + threshold):
+                improvements += 1
+                new_line.append("+" + str(100 * (ratio - 1)))
+            else:
+                same += 1
+                if i == 0:
+                    # First column is the size label; keep it as-is
+                    new_line.append(element1)
+                else:
+                    new_line.append(".")
+        new_matrix.append(new_line)
+
+    total = improvements + regressions + same
+
+    return (new_matrix, improvements, regressions, total)
+
+
+class IOzoneAnalyzer(object):
+    """
+    Analyze an unprocessed IOzone file, and generate the following types of
+    report:
+
+    * Summary of throughput for all file and record sizes combined
+    * Summary of throughput for all file sizes
+    * Summary of throughput for all record sizes
+
+    If more than one file is provided to the analyzer object, a comparison
+    between the two runs is made, searching for regressions in performance.
+    """
+    def __init__(self, list_files, output_dir):
+        self.list_files = list_files
+        if not os.path.isdir(output_dir):
+            os.makedirs(output_dir)
+        self.output_dir = output_dir
+        logging.info("Results will be stored in %s", output_dir)
+
+
+    def average_performance(self, results, size=None):
+        """
+        Flattens a list containing performance results.
+
+        @param results: List of n lists containing data from performance runs.
+        @param size: Numerical value of a size (say, file_size) that was used
+                to filter the original results list.
+        @return: List with the averaged data (one value per throughput
+                column) from the performance runs.
+        """
+        average_line = []
+        if size is not None:
+            average_line.append(size)
+        for i in range(2, 15):
+            average = geometric_mean([line[i] for line in results]) / 1024.0
+            average = int(average)
+            average_line.append(average)
+        return average_line
+
+
+    def process_results(self, results, label=None):
+        """
+        Process a list of IOzone results according to label.
+
+        @param label: IOzone column label that we'll use to filter and
+                compute geometric mean results, in practical terms either
+                'file_size' or 'record_size'.
+        @param results: A list of n x m columns with original iozone results.
+        @return: A list of n-? x (m-1) columns with geometric averages for
+                values of each label (ex, average for all file_sizes).
+        """
+        performance = []
+        if label is not None:
+            index = _LABELS.index(label)
+            sizes = unique([line[index] for line in results])
+            sizes.sort()
+            for size in sizes:
+                r_results = [line for line in results if line[index] == size]
+                performance.append(self.average_performance(r_results, size))
+        else:
+            performance.append(self.average_performance(results))
+
+        return performance
+
+
+    def parse_file(self, file):
+        """
+        Parse an IOzone results file.
+
+        @param file: File object that will be parsed.
+        @return: Matrix containing IOzone results extracted from the file.
+        """
+        lines = []
+        for line in file.readlines():
+            fields = line.split()
+            if len(fields) != 15:
+                continue
+            try:
+                lines.append([int(i) for i in fields])
+            except ValueError:
+                continue
+        return lines
+
+
+    def report(self, overall_results, record_size_results, file_size_results):
+        """
+        Generates analysis data for an IOzone run.
+
+        Generates a report to both logs (where it goes with nice headers) and
+        output files for further processing (graph generation).
+
+        @param overall_results: 1x15 Matrix containing IOzone results for all
+                file sizes
+        @param record_size_results: nx15 Matrix containing IOzone results for
+                each record size tested.
+        @param file_size_results: nx15 Matrix containing file size results
+                for each file size tested.
+        """
+        # Here we'll use the logging system to put the output of our analysis
+        # to files
+        logger = logging.getLogger()
+        formatter = logging.Formatter("")
+
+        logging.info("")
+        logging.info("TABLE:  SUMMARY of ALL FILE and RECORD SIZES                        Results in MB/sec")
+        logging.info("")
+        logging.info("FILE & RECORD  INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE")
+        logging.info("SIZES (KB)     WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("-------------------------------------------------------------------------------------------------------------------")
+        for result_line in overall_results:
+            logging.info("ALL            %-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
+        logging.info("")
+
+        logging.info("DRILLED DATA:")
+
+        logging.info("")
+        logging.info("TABLE:  RECORD Size against all FILE Sizes                          Results in MB/sec")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+
+        foutput_path = os.path.join(self.output_dir, '2d-datasource-file')
+        if os.path.isfile(foutput_path):
+            os.unlink(foutput_path)
+        foutput = logging.FileHandler(foutput_path)
+        foutput.setFormatter(formatter)
+        logger.addHandler(foutput)
+        for result_line in record_size_results:
+            logging.info("%-10s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
+        logger.removeHandler(foutput)
+
+        logging.info("")
+
+        logging.info("")
+        logging.info("TABLE:  FILE Size against all RECORD Sizes                          Results in MB/sec")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+
+        routput_path = os.path.join(self.output_dir, '2d-datasource-record')
+        if os.path.isfile(routput_path):
+            os.unlink(routput_path)
+        routput = logging.FileHandler(routput_path)
+        routput.setFormatter(formatter)
+        logger.addHandler(routput)
+        for result_line in file_size_results:
+            logging.info("%-10s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
+        logger.removeHandler(routput)
+
+        logging.info("")
+
+
+    def report_comparison(self, record, file):
+        """
+        Generates comparison data for two IOzone runs.
+
+        It compares 2 sets of nxm results and outputs a table with the
+        differences. Any difference larger than 5% (in either direction)
+        is reported.
+
+        @param record: Tuple with 4 elements containing results for record size.
+        @param file: Tuple with 4 elements containing results for file size.
+        """
+        (record_size, record_improvements, record_regressions,
+         record_total) = record
+        (file_size, file_improvements, file_regressions,
+         file_total) = file
+        logging.info("ANALYSIS of DRILLED DATA:")
+
+        logging.info("")
+        logging.info("TABLE:  RECsize Difference between runs                            Results are % DIFF")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+        for result_line in record_size:
+            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
+        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
+                     record_regressions,
+                     (100 * record_regressions/float(record_total)),
+                     record_improvements,
+                     (100 * record_improvements/float(record_total)))
+        logging.info("")
+
+        logging.info("")
+        logging.info("TABLE:  FILEsize Difference between runs                           Results are % DIFF")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+        for result_line in file_size:
+            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
+        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
+                     file_regressions,
+                     (100 * file_regressions/float(file_total)),
+                     file_improvements,
+                     (100 * file_improvements/float(file_total)))
+        logging.info("")
+
+
+    def analyze(self):
+        """
+        Analyzes and eventually compares sets of IOzone data.
+        """
+        overall = []
+        record_size = []
+        file_size = []
+        for path in self.list_files:
+            file = open(path, 'r')
+            logging.info('FILE: %s', path)
+
+            results = self.parse_file(file)
+
+            overall_results = self.process_results(results)
+            record_size_results = self.process_results(results, 'record_size')
+            file_size_results = self.process_results(results, 'file_size')
+            self.report(overall_results, record_size_results, file_size_results)
+
+            if len(self.list_files) == 2:
+                overall.append(overall_results)
+                record_size.append(record_size_results)
+                file_size.append(file_size_results)
+
+        if len(self.list_files) == 2:
+            record_comparison = compare_matrices(*record_size)
+            file_comparison = compare_matrices(*file_size)
+            self.report_comparison(record_comparison, file_comparison)
+
+
+class IOzonePlotter(object):
+    """
+    Plots graphs based on the results of an IOzone run.
+
+    Plots graphs based on the results of an IOzone run. Uses gnuplot to
+    generate the graphs.
+    """
+    def __init__(self, results_file, output_dir):
+        self.active = True
+        try:
+            self.gnuplot = os_dep.command("gnuplot")
+        except ValueError:
+            logging.error("Command gnuplot not found, disabling graph "
+                          "generation")
+            self.active = False
+
+        if not os.path.isdir(output_dir):
+            os.makedirs(output_dir)
+        self.output_dir = output_dir
+
+        if not os.path.isfile(results_file):
+            logging.error("Invalid file %s provided, disabling graph "
+                          "generation", results_file)
+            self.active = False
+            self.results_file = None
+        else:
+            self.results_file = results_file
+            self.generate_data_source()
+
+
+    def generate_data_source(self):
+        """
+        Creates data file without headers for gnuplot consumption.
+        """
+        results_file = open(self.results_file, 'r')
+        self.datasource = os.path.join(self.output_dir, '3d-datasource')
+        datasource = open(self.datasource, 'w')
+        for line in results_file.readlines():
+            fields = line.split()
+            if len(fields) != 15:
+                continue
+            try:
+                # Keep only lines where every field parses as an integer
+                [int(i) for i in fields]
+                datasource.write(line)
+            except ValueError:
+                continue
+        datasource.close()
+
+
+    def plot_2d_graphs(self):
+        """
+        For each one of the throughput parameters, generate a set of gnuplot
+        commands that will create a parametric surface with file size vs.
+        record size vs. throughput.
+        """
+        datasource_2d = os.path.join(self.output_dir, '2d-datasource-file')
+        # Column 1 holds the size; throughput values are in columns 2-14
+        for index, label in zip(range(2, 15), _LABELS[2:]):
+            commands_path = os.path.join(self.output_dir, '2d-%s.do' % label)
+            commands = ""
+            commands += "set title 'Iozone performance: %s'\n" % label
+            commands += "set logscale x\n"
+            commands += "set xlabel 'File size (KB)'\n"
+            commands += "set ylabel 'Througput (MB/s)'\n"
+            commands += "set terminal png small size 450 350\n"
+            commands += "set output '%s'\n" % os.path.join(self.output_dir,
+                                                           '2d-%s.png' % label)
+            commands += ("plot '%s' using 1:%s title '%s' with lines \n" %
+                         (datasource_2d, index, label))
+            commands_file = open(commands_path, 'w')
+            commands_file.write(commands)
+            commands_file.close()
+            try:
+                utils.run("%s %s" % (self.gnuplot, commands_path))
+            except error.CmdError, e:
+                logging.error("Problem plotting from commands file %s: %s",
+                              commands_file, str(e))
+
+
+    def plot_3d_graphs(self):
+        """
+        For each one of the throughput parameters, generate a set of gnuplot
+        commands that will create a parametric surface with file size vs.
+        record size vs. throughput.
+        """
+        # Columns 1-2 are file and record size; throughput is in columns 3-15
+        for index, label in zip(range(3, 16), _LABELS[2:]):
+            commands_path = os.path.join(self.output_dir, '%s.do' % label)
+            commands = ""
+            commands += "set title 'Iozone performance: %s'\n" % label
+            commands += "set grid lt 2 lw 1\n"
+            commands += "set surface\n"
+            commands += "set parametric\n"
+            commands += "set xtics\n"
+            commands += "set ytics\n"
+            commands += "set logscale x 2\n"
+            commands += "set logscale y 2\n"
+            commands += "set logscale z\n"
+            commands += "set xrange [2.**5:2.**24]\n"
+            commands += "set xlabel 'File size (KB)'\n"
+            commands += "set ylabel 'Record size (KB)'\n"
+            commands += "set zlabel 'Througput (KB/s)'\n"
+            commands += "set data style lines\n"
+            commands += "set dgrid3d 80,80, 3\n"
+            commands += "set terminal png small size 900 700\n"
+            commands += "set output '%s'\n" % os.path.join(self.output_dir,
+                                                           '%s.png' % label)
+            commands += ("splot '%s' using 1:2:%s title '%s'\n" %
+                         (self.datasource, index, label))
+            commands_file = open(commands_path, 'w')
+            commands_file.write(commands)
+            commands_file.close()
+            try:
+                utils.run("%s %s" % (self.gnuplot, commands_path))
+            except error.CmdError, e:
+                logging.error("Problem plotting from commands file %s: %s",
+                              commands_file, str(e))
+
+
+    def plot_all(self):
+        """
+        Plot all graphs that are to be plotted, provided that we have gnuplot.
+        """
+        if self.active:
+            self.plot_2d_graphs()
+            self.plot_3d_graphs()
+
+
+class AnalyzerLoggingConfig(logging_config.LoggingConfig):
+    def configure_logging(self, results_dir=None, verbose=False):
+        super(AnalyzerLoggingConfig, self).configure_logging(use_console=True,
+                                                        verbose=verbose)
+
+
+if __name__ == "__main__":
+    parser = optparse.OptionParser("usage: %prog [options] [filenames]")
+    options, args = parser.parse_args()
+
+    logging_manager.configure_logging(AnalyzerLoggingConfig())
+
+    if not args or len(args) > 2:
+        parser.print_help()
+        sys.exit(1)
+
+    filenames = args
+
+    o = os.path.join(os.getcwd(),
+                     "iozone-graphs-%s" % time.strftime('%Y-%m-%d-%H.%M.%S'))
+    if not os.path.isdir(o):
+        os.makedirs(o)
+
+    a = IOzoneAnalyzer(list_files=filenames, output_dir=o)
+    a.analyze()
+    p = IOzonePlotter(results_file=filenames[0], output_dir=o)
+    p.plot_all()
-- 
1.7.0.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] IOzone test: Introduce additional results postprocessing
  2010-04-30 21:20 [PATCH 1/2] IOzone test: Introduce postprocessing module Lucas Meneghel Rodrigues
@ 2010-04-30 21:20 ` Lucas Meneghel Rodrigues
  2010-04-30 21:23 ` [Autotest] [PATCH 1/2] IOzone test: Introduce postprocessing module Martin Bligh
  1 sibling, 0 replies; 5+ messages in thread
From: Lucas Meneghel Rodrigues @ 2010-04-30 21:20 UTC (permalink / raw)
  To: autotest; +Cc: kvm, Lucas Meneghel Rodrigues

Use the postprocessing module introduced in the previous
patch to analyze results and write performance graphs
and tables.
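With this wiring, the analyzer's run-to-run comparison becomes reachable from
recorded test results: when two raw output files are available, each cell is
flagged when its ratio to the reference deviates by more than a threshold. A
minimal sketch of that per-cell rule (function name illustrative, not the
module's exact code):

```python
def percent_diff(reference, value, threshold=0.05):
    """Return the signed percent difference if it exceeds the
    threshold, else None (meaning 'unchanged within tolerance')."""
    ratio = float(value) / float(reference)
    if abs(ratio - 1) > threshold:
        return 100 * (ratio - 1)
    return None

# A 10% throughput gain is reported; a 2% change stays within tolerance.
```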

Signed-off-by: Lucas Meneghel Rodrigues <lmr@redhat.com>
---
 client/tests/iozone/iozone.py |   24 +++++++++++++++++++-----
 1 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/client/tests/iozone/iozone.py b/client/tests/iozone/iozone.py
index fa3fba4..4977b3c 100755
--- a/client/tests/iozone/iozone.py
+++ b/client/tests/iozone/iozone.py
@@ -1,5 +1,6 @@
 import os, re
 from autotest_lib.client.bin import test, utils
+import postprocessing
 
 
 class iozone(test.test):
@@ -63,17 +64,19 @@ class iozone(test.test):
         self.results = utils.system_output('%s %s' % (cmd, args))
         self.auto_mode = ("-a" in args)
 
-        path = os.path.join(self.resultsdir, 'raw_output_%s' % self.iteration)
-        raw_output_file = open(path, 'w')
-        raw_output_file.write(self.results)
-        raw_output_file.close()
+        self.results_path = os.path.join(self.resultsdir,
+                                         'raw_output_%s' % self.iteration)
+        self.analysisdir = os.path.join(self.resultsdir,
+                                        'analysis_%s' % self.iteration)
+
+        utils.open_write_close(self.results_path, self.results)
 
 
     def __get_section_name(self, desc):
         return desc.strip().replace(' ', '_')
 
 
-    def postprocess_iteration(self):
+    def generate_keyval(self):
         keylist = {}
 
         if self.auto_mode:
@@ -150,3 +153,14 @@ class iozone(test.test):
                             keylist[key_name] = result
 
         self.write_perf_keyval(keylist)
+
+
+    def postprocess_iteration(self):
+        self.generate_keyval()
+        a = postprocessing.IOzoneAnalyzer(list_files=[self.results_path],
+                                          output_dir=self.analysisdir)
+        a.analyze()
+        p = postprocessing.IOzonePlotter(results_file=self.results_path,
+                                         output_dir=self.analysisdir)
+        p.plot_all()
+
-- 
1.7.0.1



* Re: [Autotest] [PATCH 1/2] IOzone test: Introduce postprocessing module
  2010-04-30 21:20 [PATCH 1/2] IOzone test: Introduce postprocessing module Lucas Meneghel Rodrigues
  2010-04-30 21:20 ` [PATCH 2/2] IOzone test: Introduce additional results postprocessing Lucas Meneghel Rodrigues
@ 2010-04-30 21:23 ` Martin Bligh
  2010-04-30 21:37   ` Lucas Meneghel Rodrigues
  1 sibling, 1 reply; 5+ messages in thread
From: Martin Bligh @ 2010-04-30 21:23 UTC (permalink / raw)
  To: Lucas Meneghel Rodrigues; +Cc: autotest, kvm

I'm slightly surprised this isn't called from postprocess
in the test? Any downside to doing that?

On Fri, Apr 30, 2010 at 2:20 PM, Lucas Meneghel Rodrigues
<lmr@redhat.com> wrote:
> This module contains code to postprocess IOzone data
> in a convenient way so we can generate performance graphs
> and condensed data. The graph generation part depends
> on gnuplot, but if the utility is not present,
> functionality will gracefully degrade.
>
> The reason why this was created as a separate module is:
>  * It doesn't pollute the main test class.
>  * Allows us to use the postprocess module as a stand alone program,
>   that can even do performance comparison between 2 IOzone runs.
>
> Signed-off-by: Lucas Meneghel Rodrigues <lmr@redhat.com>
> ---
>  client/tests/iozone/postprocessing.py |  487 +++++++++++++++++++++++++++++++++
>  1 files changed, 487 insertions(+), 0 deletions(-)
>  create mode 100755 client/tests/iozone/postprocessing.py
>
> diff --git a/client/tests/iozone/postprocessing.py b/client/tests/iozone/postprocessing.py
> new file mode 100755
> index 0000000..b495502
> --- /dev/null
> +++ b/client/tests/iozone/postprocessing.py
> @@ -0,0 +1,487 @@
> +#!/usr/bin/python
> +"""
> +Postprocessing module for IOzone. It is capable to pick results from an
> +IOzone run, calculate the geometric mean for all throughput results for
> +a given file size or record size, and then generate a series of 2D and 3D
> +graphs. The graph generation functionality depends on gnuplot, and if it
> +is not present, functionality degrates gracefully.
> +
> +@copyright: Red Hat 2010
> +"""
> +import os, sys, optparse, logging, math, time
> +import common
> +from autotest_lib.client.common_lib import logging_config, logging_manager
> +from autotest_lib.client.common_lib import error
> +from autotest_lib.client.bin import utils, os_dep
> +
> +
> +_LABELS = ('file_size', 'record_size', 'write', 'rewrite', 'read', 'reread',
> +           'randread', 'randwrite', 'bkwdread', 'recordrewrite', 'strideread',
> +           'fwrite', 'frewrite', 'fread', 'freread')
> +
> +
> +def unique(list):
> +    """
> +    Return a list of the elements in list, but without duplicates.
> +
> +    @param list: List with values.
> +    @return: List with non duplicate elements.
> +    """
> +    n = len(list)
> +    if n == 0:
> +        return []
> +    u = {}
> +    try:
> +        for x in list:
> +            u[x] = 1
> +    except TypeError:
> +        return None
> +    else:
> +        return u.keys()
> +
> +
> +def geometric_mean(values):
> +    """
> +    Evaluates the geometric mean for a list of numeric values.
> +
> +    @param values: List with values.
> +    @return: Single value representing the geometric mean for the list values.
> +    @see: http://en.wikipedia.org/wiki/Geometric_mean
> +    """
> +    try:
> +        values = [int(value) for value in values]
> +    except ValueError:
> +        return None
> +    product = 1
> +    n = len(values)
> +    if n == 0:
> +        return None
> +    return math.exp(sum([math.log(x) for x in values])/n)
> +
> +
> +def compare_matrices(matrix1, matrix2, treshold=0.05):
> +    """
> +    Compare 2 matrices nxm and return a matrix nxm with comparison data
> +
> +    @param matrix1: Reference Matrix with numeric data
> +    @param matrix2: Matrix that will be compared
> +    @param treshold: Any difference bigger than this percent treshold will be
> +            reported.
> +    """
> +    improvements = 0
> +    regressions = 0
> +    same = 0
> +    comparison_matrix = []
> +
> +    new_matrix = []
> +    for line1, line2 in zip(matrix1, matrix2):
> +        new_line = []
> +        for element1, element2 in zip(line1, line2):
> +            ratio = float(element2) / float(element1)
> +            if ratio < (1 - treshold):
> +                regressions += 1
> +                new_line.append((100 * ratio - 1) - 100)
> +            elif ratio > (1 + treshold):
> +                improvements += 1
> +                new_line.append("+" + str((100 * ratio - 1) - 100))
> +            else:
> +                same + 1
> +                if line1.index(element1) == 0:
> +                    new_line.append(element1)
> +                else:
> +                    new_line.append(".")
> +        new_matrix.append(new_line)
> +
> +    total = improvements + regressions + same
> +
> +    return (new_matrix, improvements, regressions, total)
> +
> +
> +class IOzoneAnalyzer(object):
> +    """
> +    Analyze an unprocessed IOzone file, and generate the following types of
> +    report:
> +
> +    * Summary of throughput for all file and record sizes combined
> +    * Summary of throughput for all file sizes
> +    * Summary of throughput for all record sizes
> +
> +    If more than one file is provided to the analyzer object, a comparison
> +    between the two runs is made, searching for regressions in performance.
> +    """
> +    def __init__(self, list_files, output_dir):
> +        self.list_files = list_files
> +        if not os.path.isdir(output_dir):
> +            os.makedirs(output_dir)
> +        self.output_dir = output_dir
> +        logging.info("Results will be stored in %s", output_dir)
> +
> +
> +    def average_performance(self, results, size=None):
> +        """
> +        Flattens a list containing performance results.
> +
> +        @param results: List of n lists containing data from performance runs.
> +        @param size: Numerical value of a size (say, file_size) that was used
> +                to filter the original results list.
> +        @return: List with 1 list containing average data from the performance
> +                run.
> +        """
> +        average_line = []
> +        if size is not None:
> +            average_line.append(size)
> +        for i in range(2, 15):
> +            average = geometric_mean([line[i] for line in results]) / 1024.0
> +            average = int(average)
> +            average_line.append(average)
> +        return average_line
> +
> +
> +    def process_results(self, results, label=None):
> +        """
> +        Process a list of IOzone results according to label.
> +
> +        @label: IOzone column label that we'll use to filter and compute
> +                geometric mean results, in practical term either 'file_size'
> +                or 'record_size'.
> +        @result: A list of n x m columns with original iozone results.
> +        @return: A list of n-? x (m-1) columns with geometric averages for
> +                values of each label (ex, average for all file_sizes).
> +        """
> +        performance = []
> +        if label is not None:
> +            index = _LABELS.index(label)
> +            sizes = unique([line[index] for line in results])
> +            sizes.sort()
> +            for size in sizes:
> +                r_results = [line for line in results if line[index] == size]
> +                performance.append(self.average_performance(r_results, size))
> +        else:
> +            performance.append(self.average_performance(results))
> +
> +        return performance
> +
> +
> +    def parse_file(self, fileobj):
> +        """
> +        Parse an IOzone results file.
> +
> +        @param fileobj: File object that will be parsed.
> +        @return: Matrix containing IOzone results extracted from the file.
> +        """
> +        lines = []
> +        for line in fileobj.readlines():
> +            fields = line.split()
> +            if len(fields) != 15:
> +                continue
> +            try:
> +                lines.append([int(i) for i in fields])
> +            except ValueError:
> +                continue
> +        return lines
> +
> +
> +    def report(self, overall_results, record_size_results, file_size_results):
> +        """
> +        Generates analysis data for an IOzone run.
> +
> +        Generates a report to both logs (where it goes with nice headers) and
> +        output files for further processing (graph generation).
> +
> +        @param overall_results: 1x15 Matrix containing IOzone results for all
> +                file sizes
> +        @param record_size_results: nx15 Matrix containing IOzone results for
> +                each record size tested.
> +        @param file_size_results: nx15 Matrix containing file size results
> +                for each file size tested.
> +        """
> +        # Here we'll use the logging system to put the output of our analysis
> +        # to files
> +        logger = logging.getLogger()
> +        formatter = logging.Formatter("")
> +
> +        logging.info("")
> +        logging.info("TABLE:  SUMMARY of ALL FILE and RECORD SIZES                        Results in MB/sec")
> +        logging.info("")
> +        logging.info("FILE & RECORD  INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE")
> +        logging.info("SIZES (KB)     WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> +        logging.info("-------------------------------------------------------------------------------------------------------------------")
> +        for result_line in overall_results:
> +            logging.info("ALL            %-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
> +        logging.info("")
> +
> +        logging.info("DRILLED DATA:")
> +
> +        logging.info("")
> +        logging.info("TABLE:  RECORD Size against all FILE Sizes                          Results in MB/sec")
> +        logging.info("")
> +        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> +        logging.info("--------------------------------------------------------------------------------------------------------------")
> +
> +        foutput_path = os.path.join(self.output_dir, '2d-datasource-file')
> +        if os.path.isfile(foutput_path):
> +            os.unlink(foutput_path)
> +        foutput = logging.FileHandler(foutput_path)
> +        foutput.setFormatter(formatter)
> +        logger.addHandler(foutput)
> +        for result_line in record_size_results:
> +            logging.info("%-10s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
> +        logger.removeHandler(foutput)
> +
> +        logging.info("")
> +
> +        logging.info("")
> +        logging.info("TABLE:  FILE Size against all RECORD Sizes                          Results in MB/sec")
> +        logging.info("")
> +        logging.info("FILE      INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> +        logging.info("--------------------------------------------------------------------------------------------------------------")
> +
> +        routput_path = os.path.join(self.output_dir, '2d-datasource-record')
> +        if os.path.isfile(routput_path):
> +            os.unlink(routput_path)
> +        routput = logging.FileHandler(routput_path)
> +        routput.setFormatter(formatter)
> +        logger.addHandler(routput)
> +        for result_line in file_size_results:
> +            logging.info("%-10s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
> +        logger.removeHandler(routput)
> +
> +        logging.info("")
> +
> +
> +    def report_comparison(self, record, file):
> +        """
> +        Generates comparison data for 2 IOzone runs.
> +
> +        It compares 2 sets of nxm results and outputs a table with the
> +        differences. Any difference larger than 5% in either direction
> +        is reported.
> +
> +        @param record: Tuple with 4 elements containing results for record size.
> +        @param file: Tuple with 4 elements containing results for file size.
> +        """
> +        (record_size, record_improvements, record_regressions,
> +         record_total) = record
> +        (file_size, file_improvements, file_regressions,
> +         file_total) = file
> +        logging.info("ANALYSIS of DRILLED DATA:")
> +
> +        logging.info("")
> +        logging.info("TABLE:  RECsize Difference between runs                            Results are % DIFF")
> +        logging.info("")
> +        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> +        logging.info("--------------------------------------------------------------------------------------------------------------")
> +        for result_line in record_size:
> +            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
> +        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
> +                     record_regressions,
> +                     (100 * record_regressions/float(record_total)),
> +                     record_improvements,
> +                     (100 * record_improvements/float(record_total)))
> +        logging.info("")
> +
> +        logging.info("")
> +        logging.info("TABLE:  FILEsize Difference between runs                           Results are % DIFF")
> +        logging.info("")
> +        logging.info("FILE      INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> +        logging.info("--------------------------------------------------------------------------------------------------------------")
> +        for result_line in file_size:
> +            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
> +        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
> +                     file_regressions,
> +                     (100 * file_regressions/float(file_total)),
> +                     file_improvements,
> +                     (100 * file_improvements/float(file_total)))
> +        logging.info("")
> +
> +
> +    def analyze(self):
> +        """
> +        Analyzes and, when two files are given, compares sets of IOzone data.
> +        """
> +        overall = []
> +        record_size = []
> +        file_size = []
> +        for path in self.list_files:
> +            fileobj = open(path, 'r')
> +            logging.info('FILE: %s', path)
> +
> +            results = self.parse_file(fileobj)
> +            fileobj.close()
> +
> +            overall_results = self.process_results(results)
> +            record_size_results = self.process_results(results, 'record_size')
> +            file_size_results = self.process_results(results, 'file_size')
> +            self.report(overall_results, record_size_results, file_size_results)
> +
> +            if len(self.list_files) == 2:
> +                overall.append(overall_results)
> +                record_size.append(record_size_results)
> +                file_size.append(file_size_results)
> +
> +        if len(self.list_files) == 2:
> +            record_comparison = compare_matrices(*record_size)
> +            file_comparison = compare_matrices(*file_size)
> +            self.report_comparison(record_comparison, file_comparison)
> +
> +
> +class IOzonePlotter(object):
> +    """
> +    Plots graphs based on the results of an IOzone run, using gnuplot to
> +    generate them.
> +    """
> +    def __init__(self, results_file, output_dir):
> +        self.active = True
> +        try:
> +            self.gnuplot = os_dep.command("gnuplot")
> +        except ValueError:
> +            logging.error("Command gnuplot not found, disabling graph "
> +                          "generation")
> +            self.active = False
> +
> +        if not os.path.isdir(output_dir):
> +            os.makedirs(output_dir)
> +        self.output_dir = output_dir
> +
> +        if not os.path.isfile(results_file):
> +            logging.error("Invalid file %s provided, disabling graph "
> +                          "generation", results_file)
> +            self.active = False
> +            self.results_file = None
> +        else:
> +            self.results_file = results_file
> +            self.generate_data_source()
> +
> +
> +    def generate_data_source(self):
> +        """
> +        Creates data file without headers for gnuplot consumption.
> +        """
> +        results_file = open(self.results_file, 'r')
> +        self.datasource = os.path.join(self.output_dir, '3d-datasource')
> +        datasource = open(self.datasource, 'w')
> +        for line in results_file.readlines():
> +            fields = line.split()
> +            if len(fields) != 15:
> +                continue
> +            try:
> +                # Copy only lines where all 15 fields are numeric
> +                [int(i) for i in fields]
> +                datasource.write(line)
> +            except ValueError:
> +                continue
> +        results_file.close()
> +        datasource.close()
> +
> +
> +    def plot_2d_graphs(self):
> +        """
> +        For each one of the throughput parameters, generate a set of gnuplot
> +        commands that will create a 2D plot of file size vs. throughput.
> +        """
> +        datasource_2d = os.path.join(self.output_dir, '2d-datasource-file')
> +        # In the 2D datasource, column 1 is the size; throughputs are 2-14
> +        for index, label in zip(range(2, 15), _LABELS[2:]):
> +            commands_path = os.path.join(self.output_dir, '2d-%s.do' % label)
> +            commands = ""
> +            commands += "set title 'Iozone performance: %s'\n" % label
> +            commands += "set logscale x\n"
> +            commands += "set xlabel 'File size (KB)'\n"
> +            commands += "set ylabel 'Throughput (MB/s)'\n"
> +            commands += "set terminal png small size 450 350\n"
> +            commands += "set output '%s'\n" % os.path.join(self.output_dir,
> +                                                           '2d-%s.png' % label)
> +            commands += ("plot '%s' using 1:%s title '%s' with lines \n" %
> +                         (datasource_2d, index, label))
> +            commands_file = open(commands_path, 'w')
> +            commands_file.write(commands)
> +            commands_file.close()
> +            try:
> +                utils.run("%s %s" % (self.gnuplot, commands_path))
> +            except error.CmdError, e:
> +                logging.error("Problem plotting from commands file %s: %s",
> +                              commands_path, str(e))
> +
> +
> +    def plot_3d_graphs(self):
> +        """
> +        For each one of the throughput parameters, generate a set of gnuplot
> +        commands that will create a parametric surface with file size vs.
> +        record size vs. throughput.
> +        """
> +        # In the 3D datasource, columns 1-2 are sizes; throughputs are 3-15
> +        for index, label in zip(range(3, 16), _LABELS[2:]):
> +            commands_path = os.path.join(self.output_dir, '%s.do' % label)
> +            commands = ""
> +            commands += "set title 'Iozone performance: %s'\n" % label
> +            commands += "set grid lt 2 lw 1\n"
> +            commands += "set surface\n"
> +            commands += "set parametric\n"
> +            commands += "set xtics\n"
> +            commands += "set ytics\n"
> +            commands += "set logscale x 2\n"
> +            commands += "set logscale y 2\n"
> +            commands += "set logscale z\n"
> +            commands += "set xrange [2.**5:2.**24]\n"
> +            commands += "set xlabel 'File size (KB)'\n"
> +            commands += "set ylabel 'Record size (KB)'\n"
> +            commands += "set zlabel 'Throughput (KB/s)'\n"
> +            commands += "set data style lines\n"
> +            commands += "set dgrid3d 80,80, 3\n"
> +            commands += "set terminal png small size 900 700\n"
> +            commands += "set output '%s'\n" % os.path.join(self.output_dir,
> +                                                           '%s.png' % label)
> +            commands += ("splot '%s' using 1:2:%s title '%s'\n" %
> +                         (self.datasource, index, label))
> +            commands_file = open(commands_path, 'w')
> +            commands_file.write(commands)
> +            commands_file.close()
> +            try:
> +                utils.run("%s %s" % (self.gnuplot, commands_path))
> +            except error.CmdError, e:
> +                logging.error("Problem plotting from commands file %s: %s",
> +                              commands_file, str(e))
> +
> +
> +    def plot_all(self):
> +        """
> +        Plot all graphs that are to be plotted, provided that we have gnuplot.
> +        """
> +        if self.active:
> +            self.plot_2d_graphs()
> +            self.plot_3d_graphs()
> +
> +
> +class AnalyzerLoggingConfig(logging_config.LoggingConfig):
> +    def configure_logging(self, results_dir=None, verbose=False):
> +        super(AnalyzerLoggingConfig, self).configure_logging(use_console=True,
> +                                                        verbose=verbose)
> +
> +
> +if __name__ == "__main__":
> +    parser = optparse.OptionParser("usage: %prog [options] [filenames]")
> +    options, args = parser.parse_args()
> +
> +    logging_manager.configure_logging(AnalyzerLoggingConfig())
> +
> +    if not args or len(args) > 2:
> +        parser.print_help()
> +        sys.exit(1)
> +
> +    filenames = args
> +
> +    o = os.path.join(os.getcwd(),
> +                     "iozone-graphs-%s" % time.strftime('%Y-%m-%d-%H.%M.%S'))
> +    if not os.path.isdir(o):
> +        os.makedirs(o)
> +
> +    a = IOzoneAnalyzer(list_files=filenames, output_dir=o)
> +    a.analyze()
> +    p = IOzonePlotter(results_file=filenames[0], output_dir=o)
> +    p.plot_all()
> --
> 1.7.0.1
>
> _______________________________________________
> Autotest mailing list
> Autotest@test.kernel.org
> http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] IOzone test: Introduce postprocessing module
  2010-04-30 21:23 ` [Autotest] [PATCH 1/2] IOzone test: Introduce postprocessing module Martin Bligh
@ 2010-04-30 21:37   ` Lucas Meneghel Rodrigues
  2010-04-30 22:55     ` [Autotest] " Martin Bligh
  0 siblings, 1 reply; 5+ messages in thread
From: Lucas Meneghel Rodrigues @ 2010-04-30 21:37 UTC (permalink / raw)
  To: Martin Bligh; +Cc: autotest, kvm

On Fri, 2010-04-30 at 14:23 -0700, Martin Bligh wrote:
> I'm slightly surprised this isn't called from postprocess
> in the test? Any downside to doing that?

In the second patch I change the test to use the postprocessing
module.
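For readers skimming the patch, the summarization and comparison boil down
to a per-column geometric mean plus a thresholded percent difference. A
minimal self-contained sketch of that logic (illustrative names only, no
autotest dependencies):

```python
import math

def geometric_mean(values):
    # Geometric mean of a list of positive numbers
    return math.exp(sum(math.log(v) for v in values) / len(values))

def percent_diff(reference, value, threshold=0.05):
    # Signed percent difference, or None when within the threshold
    ratio = float(value) / float(reference)
    if abs(ratio - 1) <= threshold:
        return None
    return 100 * (ratio - 1)

# Throughput samples (KB/s) for one column across two runs
# (made-up numbers, just to exercise the helpers):
run1 = [4000, 8000, 16000]
run2 = [5000, 8200, 16500]
diff = percent_diff(geometric_mean(run1), geometric_mean(run2))
```

Anything within the threshold is treated as unchanged, which mirrors the
"." cells in the comparison tables.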

> On Fri, Apr 30, 2010 at 2:20 PM, Lucas Meneghel Rodrigues
> <lmr@redhat.com> wrote:
> > This module contains code to postprocess IOzone data
> > in a convenient way so we can generate performance graphs
> > and condensed data. The graph generation part depends
> > on gnuplot, but if the utility is not present,
> > functionality will gracefully degrade.
> >
> > The reason why this was created as a separate module is:
> >  * It doesn't pollute the main test class.
> >  * Allows us to use the postprocess module as a stand alone program,
> >   that can even do performance comparison between 2 IOzone runs.
> >
> > Signed-off-by: Lucas Meneghel Rodrigues <lmr@redhat.com>
> > ---
> >  client/tests/iozone/postprocessing.py |  487 +++++++++++++++++++++++++++++++++
> >  1 files changed, 487 insertions(+), 0 deletions(-)
> >  create mode 100755 client/tests/iozone/postprocessing.py
> >
> > diff --git a/client/tests/iozone/postprocessing.py b/client/tests/iozone/postprocessing.py
> > new file mode 100755
> > index 0000000..b495502
> > --- /dev/null
> > +++ b/client/tests/iozone/postprocessing.py
> > @@ -0,0 +1,487 @@
> > +#!/usr/bin/python
> > +"""
> > +Postprocessing module for IOzone. It is capable of picking results from an
> > +IOzone run, calculating the geometric mean of all throughput results for
> > +a given file size or record size, and then generating a series of 2D and 3D
> > +graphs. The graph generation functionality depends on gnuplot; if it
> > +is not present, functionality degrades gracefully.
> > +
> > +@copyright: Red Hat 2010
> > +"""
> > +import os, sys, optparse, logging, math, time
> > +import common
> > +from autotest_lib.client.common_lib import logging_config, logging_manager
> > +from autotest_lib.client.common_lib import error
> > +from autotest_lib.client.bin import utils, os_dep
> > +
> > +
> > +_LABELS = ('file_size', 'record_size', 'write', 'rewrite', 'read', 'reread',
> > +           'randread', 'randwrite', 'bkwdread', 'recordrewrite', 'strideread',
> > +           'fwrite', 'frewrite', 'fread', 'freread')
> > +
> > +
> > +def unique(sequence):
> > +    """
> > +    Return a list of the elements in sequence, but without duplicates.
> > +
> > +    @param sequence: Sequence with values.
> > +    @return: List with non duplicate elements.
> > +    """
> > +    if len(sequence) == 0:
> > +        return []
> > +    u = {}
> > +    try:
> > +        for x in sequence:
> > +            u[x] = 1
> > +    except TypeError:
> > +        return None
> > +    else:
> > +        return u.keys()
> > +
> > +
> > +def geometric_mean(values):
> > +    """
> > +    Evaluates the geometric mean for a list of numeric values.
> > +
> > +    @param values: List with values.
> > +    @return: Single value representing the geometric mean for the list values.
> > +    @see: http://en.wikipedia.org/wiki/Geometric_mean
> > +    """
> > +    try:
> > +        values = [int(value) for value in values]
> > +    except ValueError:
> > +        return None
> > +    n = len(values)
> > +    if n == 0:
> > +        return None
> > +    return math.exp(sum([math.log(x) for x in values])/n)
> > +
> > +
> > +def compare_matrices(matrix1, matrix2, threshold=0.05):
> > +    """
> > +    Compare 2 matrices nxm and return a matrix nxm with comparison data.
> > +
> > +    @param matrix1: Reference matrix with numeric data.
> > +    @param matrix2: Matrix that will be compared.
> > +    @param threshold: Any difference bigger than this percent threshold will
> > +            be reported.
> > +    """
> > +    improvements = 0
> > +    regressions = 0
> > +    same = 0
> > +
> > +    new_matrix = []
> > +    for line1, line2 in zip(matrix1, matrix2):
> > +        new_line = []
> > +        for index, (element1, element2) in enumerate(zip(line1, line2)):
> > +            ratio = float(element2) / float(element1)
> > +            if ratio < (1 - threshold):
> > +                regressions += 1
> > +                new_line.append(100 * (ratio - 1))
> > +            elif ratio > (1 + threshold):
> > +                improvements += 1
> > +                new_line.append("+" + str(100 * (ratio - 1)))
> > +            else:
> > +                same += 1
> > +                if index == 0:
> > +                    new_line.append(element1)
> > +                else:
> > +                    new_line.append(".")
> > +        new_matrix.append(new_line)
> > +
> > +    total = improvements + regressions + same
> > +
> > +    return (new_matrix, improvements, regressions, total)
> > +
> > +
> > +class IOzoneAnalyzer(object):
> > +    """
> > +    Analyze an unprocessed IOzone file, and generate the following types of
> > +    report:
> > +
> > +    * Summary of throughput for all file and record sizes combined
> > +    * Summary of throughput for all file sizes
> > +    * Summary of throughput for all record sizes
> > +
> > +    If more than one file is provided to the analyzer object, a comparison
> > +    between the two runs is made, searching for regressions in performance.
> > +    """
> > +    def __init__(self, list_files, output_dir):
> > +        self.list_files = list_files
> > +        if not os.path.isdir(output_dir):
> > +            os.makedirs(output_dir)
> > +        self.output_dir = output_dir
> > +        logging.info("Results will be stored in %s", output_dir)
> > +
> > +
> > +    def average_performance(self, results, size=None):
> > +        """
> > +        Flattens a list containing performance results.
> > +
> > +        @param results: List of n lists containing data from performance runs.
> > +        @param size: Numerical value of a size (say, file_size) that was used
> > +                to filter the original results list.
> > +        @return: List with 1 list containing average data from the performance
> > +                run.
> > +        """
> > +        average_line = []
> > +        if size is not None:
> > +            average_line.append(size)
> > +        for i in range(2, 15):
> > +            average = geometric_mean([line[i] for line in results]) / 1024.0
> > +            average = int(average)
> > +            average_line.append(average)
> > +        return average_line
> > +
> > +
> > +    def process_results(self, results, label=None):
> > +        """
> > +        Process a list of IOzone results according to label.
> > +
> > +        @label: IOzone column label that we'll use to filter and compute
> > +                geometric mean results, in practical term either 'file_size'
> > +                or 'record_size'.
> > +        @result: A list of n x m columns with original iozone results.
> > +        @return: A list of n-? x (m-1) columns with geometric averages for
> > +                values of each label (ex, average for all file_sizes).
> > +        """
> > +        performance = []
> > +        if label is not None:
> > +            index = _LABELS.index(label)
> > +            sizes = unique([line[index] for line in results])
> > +            sizes.sort()
> > +            for size in sizes:
> > +                r_results = [line for line in results if line[index] == size]
> > +                performance.append(self.average_performance(r_results, size))
> > +        else:
> > +            performance.append(self.average_performance(results))
> > +
> > +        return performance
> > +
> > +
> > +    def parse_file(self, file):
> > +        """
> > +        Parse an IOzone results file.
> > +
> > +        @param file: File object that will be parsed.
> > +        @return: Matrix containing IOzone results extracted from the file.
> > +        """
> > +        lines = []
> > +        for line in file.readlines():
> > +            fields = line.split()
> > +            if len(fields) != 15:
> > +                continue
> > +            try:
> > +                lines.append([int(i) for i in fields])
> > +            except ValueError:
> > +                continue
> > +        return lines
> > +
> > +
> > +    def report(self, overall_results, record_size_results, file_size_results):
> > +        """
> > +        Generates analysis data for IOZone run.
> > +
> > +        Generates a report to both logs (where it goes with nice headers) and
> > +        output files for further processing (graph generation).
> > +
> > +        @param overall_results: 1x15 Matrix containing IOzone results for all
> > +                file sizes
> > +        @param record_size_results: nx15 Matrix containing IOzone results for
> > +                each record size tested.
> > +        @param file_size_results: nx15 Matrix containing file size results
> > +                for each file size tested.
> > +        """
> > +        # Here we'll use the logging system to put the output of our analysis
> > +        # to files
> > +        logger = logging.getLogger()
> > +        formatter = logging.Formatter("")
> > +
> > +        logging.info("")
> > +        logging.info("TABLE:  SUMMARY of ALL FILE and RECORD SIZES                        Results in MB/sec")
> > +        logging.info("")
> > +        logging.info("FILE & RECORD  INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE")
> > +        logging.info("SIZES (KB)     WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> > +        logging.info("-------------------------------------------------------------------------------------------------------------------")
> > +        for result_line in overall_results:
> > +            logging.info("ALL            %-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
> > +        logging.info("")
> > +
> > +        logging.info("DRILLED DATA:")
> > +
> > +        logging.info("")
> > +        logging.info("TABLE:  RECORD Size against all FILE Sizes                          Results in MB/sec")
> > +        logging.info("")
> > +        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> > +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> > +        logging.info("--------------------------------------------------------------------------------------------------------------")
> > +
> > +        foutput_path = os.path.join(self.output_dir, '2d-datasource-file')
> > +        if os.path.isfile(foutput_path):
> > +            os.unlink(foutput_path)
> > +        foutput = logging.FileHandler(foutput_path)
> > +        foutput.setFormatter(formatter)
> > +        logger.addHandler(foutput)
> > +        for result_line in record_size_results:
> > +            logging.info("%-10s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
> > +        logger.removeHandler(foutput)
> > +
> > +        logging.info("")
> > +
> > +        logging.info("")
> > +        logging.info("TABLE:  FILE Size against all RECORD Sizes                          Results in MB/sec")
> > +        logging.info("")
> > +        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> > +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> > +        logging.info("--------------------------------------------------------------------------------------------------------------")
> > +
> > +        routput_path = os.path.join(self.output_dir, '2d-datasource-record')
> > +        if os.path.isfile(routput_path):
> > +            os.unlink(routput_path)
> > +        routput = logging.FileHandler(routput_path)
> > +        routput.setFormatter(formatter)
> > +        logger.addHandler(routput)
> > +        for result_line in file_size_results:
> > +            logging.info("%-10s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
> > +        logger.removeHandler(routput)
> > +
> > +        logging.info("")
> > +
> > +
> > +    def report_comparison(self, record, file):
> > +        """
> > +        Generates comparison data for 2 IOZone runs.
> > +
> > +        It compares 2 sets of nxm results and outputs a table with differences.
> > +        If a difference higher or smaller than 5% is found, a warning is
> > +        triggered.
> > +
> > +        @param record: Tuple with 4 elements containing results for record size.
> > +        @param file: Tuple with 4 elements containing results for file size.
> > +        """
> > +        (record_size, record_improvements, record_regressions,
> > +         record_total) = record
> > +        (file_size, file_improvements, file_regressions,
> > +         file_total) = file
> > +        logging.info("ANALYSIS of DRILLED DATA:")
> > +
> > +        logging.info("")
> > +        logging.info("TABLE:  RECsize Difference between runs                            Results are % DIFF")
> > +        logging.info("")
> > +        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> > +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> > +        logging.info("--------------------------------------------------------------------------------------------------------------")
> > +        for result_line in record_size:
> > +            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
> > +        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
> > +                     record_regressions,
> > +                     (100 * record_regressions/float(record_total)),
> > +                     record_improvements,
> > +                     (100 * record_improvements/float(record_total)))
> > +        logging.info("")
> > +
> > +        logging.info("")
> > +        logging.info("TABLE:  FILEsize Difference between runs                           Results are % DIFF")
> > +        logging.info("")
> > +        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
> > +        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
> > +        logging.info("--------------------------------------------------------------------------------------------------------------")
> > +        for result_line in file_size:
> > +            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
> > +        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
> > +                     file_regressions,
> > +                     (100 * file_regressions/float(file_total)),
> > +                     file_improvements,
> > +                     (100 * file_improvements/float(file_total)))
> > +        logging.info("")
> > +
> > +
> > +    def analyze(self):
> > +        """
> > +        Analyzes and eventually compares sets of IOzone data.
> > +        """
> > +        overall = []
> > +        record_size = []
> > +        file_size = []
> > +        for path in self.list_files:
> > +            file = open(path, 'r')
> > +            logging.info('FILE: %s', path)
> > +
> > +            results = self.parse_file(file)
> > +
> > +            overall_results = self.process_results(results)
> > +            record_size_results = self.process_results(results, 'record_size')
> > +            file_size_results = self.process_results(results, 'file_size')
> > +            self.report(overall_results, record_size_results, file_size_results)
> > +
> > +            if len(self.list_files) == 2:
> > +                overall.append(overall_results)
> > +                record_size.append(record_size_results)
> > +                file_size.append(file_size_results)
> > +
> > +        if len(self.list_files) == 2:
> > +            record_comparison = compare_matrices(*record_size)
> > +            file_comparison = compare_matrices(*file_size)
> > +            self.report_comparison(record_comparison, file_comparison)
> > +
> > +
> > +class IOzonePlotter(object):
> > +    """
> > +    Plots graphs based on the results of an IOzone run.
> > +
> > +    Plots graphs based on the results of an IOzone run. Uses gnuplot to
> > +    generate the graphs.
> > +    """
> > +    def __init__(self, results_file, output_dir):
> > +        self.active = True
> > +        try:
> > +            self.gnuplot = os_dep.command("gnuplot")
> > +        except ValueError:
> > +            logging.error("Command gnuplot not found, disabling graph "
> > +                          "generation")
> > +            self.active = False
> > +
> > +        if not os.path.isdir(output_dir):
> > +            os.makedirs(output_dir)
> > +        self.output_dir = output_dir
> > +
> > +        if not os.path.isfile(results_file):
> > +            logging.error("Invalid file %s provided, disabling graph "
> > +                          "generation", results_file)
> > +            self.active = False
> > +            self.results_file = None
> > +        else:
> > +            self.results_file = results_file
> > +            self.generate_data_source()
> > +
> > +
> > +    def generate_data_source(self):
> > +        """
> > +        Creates data file without headers for gnuplot consumption.
> > +        """
> > +        results_file = open(self.results_file, 'r')
> > +        self.datasource = os.path.join(self.output_dir, '3d-datasource')
> > +        datasource = open(self.datasource, 'w')
> > +        for line in results_file.readlines():
> > +            fields = line.split()
> > +            if len(fields) != 15:
> > +                continue
> > +            try:
> > +                # Keep only lines where all 15 fields are numeric
> > +                [int(i) for i in fields]
> > +                datasource.write(line)
> > +            except ValueError:
> > +                continue
> > +        results_file.close()
> > +        datasource.close()
> > +
> > +
> > +    def plot_2d_graphs(self):
> > +        """
> > +        For each one of the throughput parameters, generate a set of gnuplot
> > +        commands that will create a 2D plot of file size vs. throughput.
> > +        """
> > +        datasource_2d = os.path.join(self.output_dir, '2d-datasource-file')
> > +        for index, label in zip(range(2, 15), _LABELS[2:]):
> > +            commands_path = os.path.join(self.output_dir, '2d-%s.do' % label)
> > +            commands = ""
> > +            commands += "set title 'Iozone performance: %s'\n" % label
> > +            commands += "set logscale x\n"
> > +            commands += "set xlabel 'File size (KB)'\n"
> > +            commands += "set ylabel 'Throughput (MB/s)'\n"
> > +            commands += "set terminal png small size 450 350\n"
> > +            commands += "set output '%s'\n" % os.path.join(self.output_dir,
> > +                                                           '2d-%s.png' % label)
> > +            commands += ("plot '%s' using 1:%s title '%s' with lines \n" %
> > +                         (datasource_2d, index, label))
> > +            commands_file = open(commands_path, 'w')
> > +            commands_file.write(commands)
> > +            commands_file.close()
> > +            try:
> > +                utils.run("%s %s" % (self.gnuplot, commands_path))
> > +            except error.CmdError, e:
> > +                logging.error("Problem plotting from commands file %s: %s",
> > +                              commands_path, str(e))
> > +
> > +
> > +    def plot_3d_graphs(self):
> > +        """
> > +        For each one of the throughput parameters, generate a set of gnuplot
> > +        commands that will create a parametric surface with file size vs.
> > +        record size vs. throughput.
> > +        """
> > +        for index, label in zip(range(3, 16), _LABELS[2:]):
> > +            commands_path = os.path.join(self.output_dir, '%s.do' % label)
> > +            commands = ""
> > +            commands += "set title 'Iozone performance: %s'\n" % label
> > +            commands += "set grid lt 2 lw 1\n"
> > +            commands += "set surface\n"
> > +            commands += "set parametric\n"
> > +            commands += "set xtics\n"
> > +            commands += "set ytics\n"
> > +            commands += "set logscale x 2\n"
> > +            commands += "set logscale y 2\n"
> > +            commands += "set logscale z\n"
> > +            commands += "set xrange [2.**5:2.**24]\n"
> > +            commands += "set xlabel 'File size (KB)'\n"
> > +            commands += "set ylabel 'Record size (KB)'\n"
> > +            commands += "set zlabel 'Throughput (KB/s)'\n"
> > +            commands += "set style data lines\n"
> > +            commands += "set dgrid3d 80,80,3\n"
> > +            commands += "set terminal png small size 900 700\n"
> > +            commands += "set output '%s'\n" % os.path.join(self.output_dir,
> > +                                                           '%s.png' % label)
> > +            commands += ("splot '%s' using 1:2:%s title '%s'\n" %
> > +                         (self.datasource, index, label))
> > +            commands_file = open(commands_path, 'w')
> > +            commands_file.write(commands)
> > +            commands_file.close()
> > +            try:
> > +                utils.run("%s %s" % (self.gnuplot, commands_path))
> > +            except error.CmdError, e:
> > +                logging.error("Problem plotting from commands file %s: %s",
> > +                              commands_path, str(e))
> > +
> > +
> > +    def plot_all(self):
> > +        """
> > +        Plot all graphs that are to be plotted, provided that we have gnuplot.
> > +        """
> > +        if self.active:
> > +            self.plot_2d_graphs()
> > +            self.plot_3d_graphs()
> > +
> > +
> > +class AnalyzerLoggingConfig(logging_config.LoggingConfig):
> > +    def configure_logging(self, results_dir=None, verbose=False):
> > +        super(AnalyzerLoggingConfig, self).configure_logging(use_console=True,
> > +                                                             verbose=verbose)
> > +
> > +
> > +if __name__ == "__main__":
> > +    parser = optparse.OptionParser("usage: %prog [options] [filenames]")
> > +    options, args = parser.parse_args()
> > +
> > +    logging_manager.configure_logging(AnalyzerLoggingConfig())
> > +
> > +    if args:
> > +        filenames = args
> > +    else:
> > +        parser.print_help()
> > +        sys.exit(1)
> > +
> > +    if len(args) > 2:
> > +        parser.print_help()
> > +        sys.exit(1)
> > +
> > +    o = os.path.join(os.getcwd(),
> > +                     "iozone-graphs-%s" % time.strftime('%Y-%m-%d-%H.%M.%S'))
> > +    if not os.path.isdir(o):
> > +        os.makedirs(o)
> > +
> > +    a = IOzoneAnalyzer(list_files=filenames, output_dir=o)
> > +    a.analyze()
> > +    p = IOzonePlotter(results_file=filenames[0], output_dir=o)
> > +    p.plot_all()
> > --
> > 1.7.0.1
> >
> > _______________________________________________
> > Autotest mailing list
> > Autotest@test.kernel.org
> > http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
> >

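[As a quick aside for readers of the archive: the core averaging the module performs — the geometric mean of throughput samples, reported in MB/s — can be reproduced with the standard library alone. This is a standalone sketch, not part of the patch, and the sample throughput values below are made up.]

```python
import math

def geometric_mean(values):
    # Same formula as the patch: exp(mean(log(x))) over integer samples
    values = [int(v) for v in values]
    if not values:
        return None
    return math.exp(sum(math.log(x) for x in values) / len(values))

# Hypothetical throughput samples in KB/s; the analyzer divides by 1024.0
# to report MB/s, as average_performance() does in the patch.
samples = [102400, 204800, 409600]
print(round(geometric_mean(samples) / 1024.0))  # -> 200
```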
^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Autotest] [PATCH 1/2] IOzone test: Introduce postprocessing module
  2010-04-30 21:37   ` Lucas Meneghel Rodrigues
@ 2010-04-30 22:55     ` Martin Bligh
  0 siblings, 0 replies; 5+ messages in thread
From: Martin Bligh @ 2010-04-30 22:55 UTC (permalink / raw)
  To: Lucas Meneghel Rodrigues; +Cc: autotest, kvm

On Fri, Apr 30, 2010 at 2:37 PM, Lucas Meneghel Rodrigues
<lmr@redhat.com> wrote:
> On Fri, 2010-04-30 at 14:23 -0700, Martin Bligh wrote:
>> I'm slightly surprised this isn't called from postprocess
>> in the test? Any downside to doing that?
>
> In the second patch I do the change to make the test to use the
> postprocessing module.

Ah, OK, missed that. Will go look. This one looks good.


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2010-04-30 22:55 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-30 21:20 [PATCH 1/2] IOzone test: Introduce postprocessing module Lucas Meneghel Rodrigues
2010-04-30 21:20 ` [PATCH 2/2] IOzone test: Introduce additional results postprocessing Lucas Meneghel Rodrigues
2010-04-30 21:23 ` [Autotest] [PATCH 1/2] IOzone test: Introduce postprocessing module Martin Bligh
2010-04-30 21:37   ` Lucas Meneghel Rodrigues
2010-04-30 22:55     ` [Autotest] " Martin Bligh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox