From f29fef6b673596b004875a6e3822febbde565ce1 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 26 Jul 2018 06:43:46 +0200 Subject: [PATCH 01/20] [benchmark] Print delim in verbose config --- benchmark/utils/DriverUtils.swift | 1 + test/benchmark/Benchmark_O.test.md | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/benchmark/utils/DriverUtils.swift b/benchmark/utils/DriverUtils.swift index 9e9b0e067507a..53f6ea027d058 100644 --- a/benchmark/utils/DriverUtils.swift +++ b/benchmark/utils/DriverUtils.swift @@ -160,6 +160,7 @@ struct TestConfig { LogMemory: \(logMemory) SampleTime: \(sampleTime) FixedIters: \(fixedNumIters) + Delimiter: \(String(reflecting: delim)) Tests Filter: \(c.tests ?? []) Tests to run: \(testList) diff --git a/test/benchmark/Benchmark_O.test.md b/test/benchmark/Benchmark_O.test.md index 3e07a76aba807..ac8419eb54e94 100644 --- a/test/benchmark/Benchmark_O.test.md +++ b/test/benchmark/Benchmark_O.test.md @@ -143,6 +143,17 @@ LOGMEMORY: ,{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}} LOGVERBOSE-LABEL: Running AngryPhonebook for 2 samples. ```` +Verify the specified delimiter is used when logging to console. The non-verbose +variant of this invocation is used from [`Benchmark_Driver`][BD] to get the list +of all tests. That's why it is *crucial* to tests this integration point. + +```` +RUN: %Benchmark_O --list --skip-tags= --delim=$'\t' --verbose \ +RUN: | %FileCheck %s --check-prefix LOGVERBOSEDELIM +LOGVERBOSEDELIM: Delimiter: "\t" +LOGVERBOSEDELIM: # Test [Tags] +```` + ## Error Handling ```` From 179b12103f2c5f84620d7fdefe0a19be30c0d9e0 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 9 Aug 2018 07:51:54 +0200 Subject: [PATCH 02/20] [benchmark] Refactor formatting responsibilities Moved result formatting methods from `PerformanceTestResult` and `ResultComparison` to `ReportFormatter`, in order to free PTR to take more computational responsibilities in the future. --- benchmark/scripts/compare_perf_tests.py | 113 +++++++++++-------- benchmark/scripts/test_compare_perf_tests.py | 83 +++++++------- 2 files changed, 108 insertions(+), 88 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 2865df59bd116..2f09f63da2e5e 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -98,25 +98,15 @@ def push(x): values = [r.min, r.max, r.median][:min(r.samples, 3)] map(push, values) - # Column labels for header row in results table - header = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') - - def values(self): - """Values property for display in results table comparisons - in format: ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS'). - """ - return ( - self.name, - str(self.min), str(self.max), str(int(self.mean)), - str(self.max_rss) if self.max_rss else '—' - ) - class ResultComparison(object): """ResultComparison compares MINs from new and old PerformanceTestResult. + It computes speedup ratio and improvement delta (%). """ + def __init__(self, old, new): + """Initialize with old and new `PerformanceTestResult`s to compare.""" self.old = old self.new = new assert(old.name == new.name) @@ -129,25 +119,10 @@ def __init__(self, old, new): ratio = (new.min + 0.001) / (old.min + 0.001) self.delta = ((ratio - 1) * 100) - # Add ' (?)' to the speedup column as indication of dubious changes: - # result's MIN falls inside the (MIN, MAX) interval of result they are - # being compared with. 
- self.is_dubious = ( - ' (?)' if ((old.min < new.min and new.min < old.max) or - (new.min < old.min and old.min < new.max)) - else '') - - # Column labels for header row in results table - header = ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') - - def values(self): - """Values property for display in results table comparisons - in format: ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP'). - """ - return (self.name, - str(self.old.min), str(self.new.min), - '{0:+.1f}%'.format(self.delta), - '{0:.2f}x{1}'.format(self.ratio, self.is_dubious)) + # Indication of dubious changes: when result's MIN falls inside the + # (MIN, MAX) interval of result they are being compared with. + self.is_dubious = ((old.min < new.min and new.min < old.max) or + (new.min < old.min and old.min < new.max)) class TestComparator(object): @@ -215,13 +190,17 @@ def partition(l, p): class ReportFormatter(object): - """ReportFormatter formats the `PerformanceTestResult`s and - `ResultComparison`s provided by `TestComparator` using their `header` and - `values()` into report table. Supported formats are: `markdown` (used for - displaying benchmark results on GitHub), `git` and `html`. + """Creates the report from perfromance test comparison in specified format. + + `ReportFormatter` formats the `PerformanceTestResult`s and + `ResultComparison`s provided by `TestComparator` into report table. + Supported formats are: `markdown` (used for displaying benchmark results on + GitHub), `git` and `html`. """ + def __init__(self, comparator, old_branch, new_branch, changes_only, single_table=False): + """Initialize with `TestComparator` and names of branches.""" self.comparator = comparator self.old_branch = old_branch self.new_branch = new_branch @@ -237,13 +216,45 @@ def __init__(self, comparator, old_branch, new_branch, changes_only, GIT_DETAIL = """ {0} ({1}): {2}""" + PERFORMANCE_TEST_RESULT_HEADER = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') + RESULT_COMPARISON_HEADER = ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + + @staticmethod + def header_for(result): + """Column labels for header row in results table.""" + return (ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER + if isinstance(result, PerformanceTestResult) else + # isinstance(result, ResultComparison) + ReportFormatter.RESULT_COMPARISON_HEADER) + + @staticmethod + def values(result): + """Format values from PerformanceTestResult or ResultComparison. + + Returns tuple of strings to display in the results table. 
+ """ + return ( + (result.name, + str(result.min), str(result.max), str(int(result.mean)), + str(result.max_rss) if result.max_rss else '—') + if isinstance(result, PerformanceTestResult) else + # isinstance(result, ResultComparison) + (result.name, + str(result.old.min), str(result.new.min), + '{0:+.1f}%'.format(result.delta), + '{0:.2f}x{1}'.format(result.ratio, + ' (?)' if result.is_dubious else '')) + ) + def markdown(self): + """Report results of benchmark comparisons in Markdown format.""" return self._formatted_text( ROW='{0} | {1} | {2} | {3} | {4} \n', HEADER_SEPARATOR='---', DETAIL=self.MARKDOWN_DETAIL) def git(self): + """Report results of benchmark comparisons in 'git' format.""" return self._formatted_text( ROW='{0} {1} {2} {3} {4} \n', HEADER_SEPARATOR=' ', @@ -251,14 +262,15 @@ def git(self): def _column_widths(self): changed = self.comparator.decreased + self.comparator.increased - comparisons = (changed if self.changes_only else - changed + self.comparator.unchanged) - comparisons += self.comparator.added + self.comparator.removed + results = (changed if self.changes_only else + changed + self.comparator.unchanged) + results += self.comparator.added + self.comparator.removed widths = [ map(len, columns) for columns in - [PerformanceTestResult.header, ResultComparison.header] + - [c.values() for c in comparisons] + [ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, + ReportFormatter.RESULT_COMPARISON_HEADER] + + [ReportFormatter.values(r) for r in results] ] def max_widths(maximum, widths): @@ -285,8 +297,8 @@ def format_columns(r, strong): def table(title, results, is_strong=False, is_open=False): rows = [ - row(format_columns(result_comparison.values(), is_strong)) - for result_comparison in results + row(format_columns(ReportFormatter.values(r), is_strong)) + for r in results ] if not rows: return '' @@ -294,7 +306,7 @@ def table(title, results, is_strong=False, is_open=False): if self.single_table: t = '' if not self.header_printed: - t += header(results[0].header) + t += header(ReportFormatter.header_for(results[0])) self.header_printed = True t += row(('**' + title + '**', '', '', '', '')) t += ''.join(rows) @@ -303,7 +315,8 @@ def table(title, results, is_strong=False, is_open=False): return DETAIL.format( *[ title, len(results), - (header(results[0].header) + ''.join(rows)), + (header(ReportFormatter.header_for(results[0])) + + ''.join(rows)), ('open' if is_open else '') ]) @@ -363,7 +376,7 @@ def table(title, results, is_strong=False, is_open=False): """ def html(self): - + """Report results of benchmark comparisons in HTML format.""" def row(name, old, new, delta, speedup, speedup_color): return self.HTML_ROW.format( name, old, new, delta, speedup_color, speedup) @@ -373,11 +386,12 @@ def header(contents): def table(title, results, speedup_color): rows = [ - row(*(result_comparison.values() + (speedup_color,))) - for result_comparison in results + row(*(ReportFormatter.values(r) + (speedup_color,))) + for r in results ] return ('' if not rows else - header((title, len(results)) + results[0].header[1:]) + + header((title, len(results)) + + ReportFormatter.header_for(results[0])[1:]) + ''.join(rows)) return self.HTML.format( @@ -423,6 +437,7 @@ def parse_args(args): def main(): + """Compare benchmarks for changes in a formatted report.""" args = parse_args(sys.argv[1:]) comparator = TestComparator(args.old_file, args.new_file, args.delta_threshold) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 
fc7f4e4454e56..350515c8c5933 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -63,24 +63,6 @@ def test_repr(self): 'min:10664 max:12933 mean:11035 sd:576.0 median:10884>' ) - def test_header(self): - self.assertEquals(PerformanceTestResult.header, - ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS')) - - def test_values(self): - log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' - r = PerformanceTestResult(log_line.split(',')) - self.assertEquals( - r.values(), - ('AngryPhonebook', '10664', '12933', '11035', '—') - ) - log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' - r = PerformanceTestResult(log_line.split(',')) - self.assertEquals( - r.values(), - ('AngryPhonebook', '12045', '12045', '12045', '10510336') - ) - def test_merge(self): tests = """1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336 1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336 @@ -142,31 +124,13 @@ def test_init(self): ResultComparison, self.r0, self.r1 ) - def test_header(self): - self.assertEquals(ResultComparison.header, - ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP')) - - def test_values(self): - rc = ResultComparison(self.r1, self.r2) - self.assertEquals( - rc.values(), - ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') - ) - # other way around - rc = ResultComparison(self.r2, self.r1) - self.assertEquals( - rc.values(), - ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') - ) - def test_values_is_dubious(self): + self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious) self.r2.max = self.r1.min + 1 # new.min < old.min < new.max - rc = ResultComparison(self.r1, self.r2) - self.assertEquals(rc.values()[4], '1.06x (?)') + self.assertTrue(ResultComparison(self.r1, self.r2).is_dubious) # other way around: old.min < new.min < old.max - rc = ResultComparison(self.r2, self.r1) - self.assertEquals(rc.values()[4], '0.94x (?)') + self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious) class OldAndNewLog(unittest.TestCase): @@ -266,6 +230,37 @@ def assert_git_contains(self, texts): def assert_html_contains(self, texts): self.assert_report_contains(texts, self.html) + def test_values(self): + self.assertEquals( + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,20,10664,12933,11035,576,10884'.split(','))), + ('AngryPhonebook', '10664', '12933', '11035', '—') + ) + self.assertEquals( + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' + .split(','))), + ('AngryPhonebook', '12045', '12045', '12045', '10510336') + ) + + r1 = PerformanceTestResult( + '1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336'.split(',')) + r2 = PerformanceTestResult( + '1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144'.split(',')) + self.assertEquals( + ReportFormatter.values(ResultComparison(r1, r2)), + ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') + ) + self.assertEquals( + ReportFormatter.values(ResultComparison(r2, r1)), + ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') + ) + r2.max = r1.min + 1 + self.assertEquals( + ReportFormatter.values(ResultComparison(r1, r2))[4], + '1.06x (?)' # is_dubious + ) + def test_justified_columns(self): """Table columns are all formated with same width, defined by the longest value. @@ -281,6 +276,16 @@ def test_column_headers(self): """Report contains table headers for ResultComparisons and changed PerformanceTestResults. 
""" + performance_test_result = self.tc.added[0] + self.assertEquals( + ReportFormatter.header_for(performance_test_result), + ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') + ) + comparison_result = self.tc.increased[0] + self.assertEquals( + ReportFormatter.header_for(comparison_result), + ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + ) self.assert_markdown_contains([ 'TEST | OLD | NEW | DELTA | SPEEDUP', '--- | --- | --- | --- | --- ', From 343f284227ba8b151f8aee39f750a4ac172fb790 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 16 Aug 2018 19:46:58 +0200 Subject: [PATCH 03/20] [benchmark] Removed legacy submit command Also removed inused imports. --- benchmark/scripts/Benchmark_Driver | 117 ----------------------------- 1 file changed, 117 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 92f12c627855a..c5a502f86917a 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -14,71 +14,16 @@ # ===---------------------------------------------------------------------===// import argparse -import datetime import glob -import json import os import re import subprocess import sys import time -import urllib -import urllib2 DRIVER_DIR = os.path.dirname(os.path.realpath(__file__)) -def parse_results(res, optset): - # Parse lines like this - # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),PEAK_MEMORY(B) - score_re = re.compile(r"(\d+),[ \t]*(\w+)," + - ",".join([r"[ \t]*([\d.]+)"] * 7)) - # The Totals line would be parsed like this. - total_re = re.compile(r"()(Totals)," + - ",".join([r"[ \t]*([\d.]+)"] * 7)) - key_group = 2 - val_group = 4 - mem_group = 9 - - tests = [] - for line in res.split(): - m = score_re.match(line) - if not m: - m = total_re.match(line) - if not m: - continue - testresult = int(m.group(val_group)) - testname = m.group(key_group) - test = {} - test['Data'] = [testresult] - test['Info'] = {} - test['Name'] = "nts.swift/" + optset + "." + testname + ".exec" - tests.append(test) - if testname != 'Totals': - mem_testresult = int(m.group(mem_group)) - mem_test = {} - mem_test['Data'] = [mem_testresult] - mem_test['Info'] = {} - mem_test['Name'] = "nts.swift/mem_maxrss." + \ - optset + "." 
+ testname + ".mem" - tests.append(mem_test) - return tests - - -def submit_to_lnt(data, url): - print("\nSubmitting results to LNT server...") - json_report = {'input_data': json.dumps(data), 'commit': '1'} - data = urllib.urlencode(json_report) - response_str = urllib2.urlopen(urllib2.Request(url, data)) - response = json.loads(response_str.read()) - if 'success' in response: - print("Server response:\tSuccess") - else: - print("Server response:\tError") - print("Error:\t", response['error']) - sys.exit(1) - - def instrument_test(driver_path, test, num_samples): """Run a test and instrument its peak memory use""" test_outputs = [] @@ -242,43 +187,6 @@ def run_benchmarks(driver, benchmarks=[], num_samples=10, verbose=False, return formatted_output -def submit(args): - print("SVN revision:\t", args.revision) - print("Machine name:\t", args.machine) - print("Iterations:\t", args.iterations) - print("Optimizations:\t", ','.join(args.optimization)) - print("LNT host:\t", args.lnt_host) - starttime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - print("Start time:\t", starttime) - data = {} - data['Tests'] = [] - data['Machine'] = {'Info': {'name': args.machine}, 'Name': args.machine} - print("\nRunning benchmarks...") - for optset in args.optimization: - print("Opt level:\t", optset) - file = os.path.join(args.tests, "Benchmark_" + optset) - try: - res = run_benchmarks( - file, benchmarks=get_tests(file, args), - num_samples=args.iterations) - data['Tests'].extend(parse_results(res, optset)) - except subprocess.CalledProcessError as e: - print("Execution failed.. Test results are empty.") - print("Process output:\n", e.output) - - endtime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - data['Run'] = {'End Time': endtime, - 'Info': {'inferred_run_order': str(args.revision), - 'run_order': str(args.revision), - 'tag': 'nts', - 'test_suite_revision': 'None'}, - 'Start Time': starttime} - print("End time:\t", endtime) - - submit_to_lnt(data, args.lnt_host) - return 0 - - def run(args): optset = args.optimization file = os.path.join(args.tests, "Benchmark_" + optset) @@ -416,29 +324,6 @@ def main(): '(default: DRIVER_DIR)', default=DRIVER_DIR) - submit_parser = subparsers.add_parser( - 'submit', - help='Run benchmarks and submit results to LNT', - parents=[parent_parser]) - submit_parser.add_argument( - '-o', '--optimization', nargs='+', - help='optimization levels to use (default: O Onone Osize)', - default=['O', 'Onone', 'Osize']) - submit_parser.add_argument( - '-i', '--iterations', - help='number of times to run each test (default: 10)', - type=positive_int, default=10) - submit_parser.add_argument( - '-m', '--machine', required=True, - help='LNT machine name') - submit_parser.add_argument( - '-r', '--revision', required=True, - help='SVN revision of compiler to identify the LNT run', type=int) - submit_parser.add_argument( - '-l', '--lnt_host', required=True, - help='LNT host to submit results to') - submit_parser.set_defaults(func=submit) - run_parser = subparsers.add_parser( 'run', help='Run benchmarks and output results to stdout', @@ -480,8 +365,6 @@ def main(): compare_parser.set_defaults(func=compare) args = parser.parse_args() - if args.func != compare and isinstance(args.optimization, list): - args.optimization = sorted(list(set(args.optimization))) return args.func(args) From 0b990a82a5fc2c8bbd1d149976aaf702a4b3a3db Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Tue, 7 Aug 2018 22:15:52 +0200 Subject: [PATCH 04/20] [benchmark] Extracted test_utils.py 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moving the `captured_output` function to own file. Adding homegrown unit testing helper classes `Stub` and `Mock`. The issue is that the unittest.mock was added in Python 3.3 and we need to run on Python 2.7. `Stub` and `Mock` were organically developed as minimal implementations to support the common testing patterns used on the original branch, but since I’m rewriting the commit history to provide an easily digestible narrative, it makes sense to introduce them here in one step as a custom unit testing library. --- benchmark/scripts/test_compare_perf_tests.py | 14 +--- benchmark/scripts/test_utils.py | 88 ++++++++++++++++++++ 2 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 benchmark/scripts/test_utils.py diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 350515c8c5933..c92b9a8e90448 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -19,9 +19,6 @@ import tempfile import unittest -from StringIO import StringIO -from contextlib import contextmanager - from compare_perf_tests import PerformanceTestResult from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison @@ -29,16 +26,7 @@ from compare_perf_tests import main from compare_perf_tests import parse_args - -@contextmanager -def captured_output(): - new_out, new_err = StringIO(), StringIO() - old_out, old_err = sys.stdout, sys.stderr - try: - sys.stdout, sys.stderr = new_out, new_err - yield sys.stdout, sys.stderr - finally: - sys.stdout, sys.stderr = old_out, old_err +from test_utils import captured_output class TestPerformanceTestResult(unittest.TestCase): diff --git a/benchmark/scripts/test_utils.py b/benchmark/scripts/test_utils.py new file mode 100644 index 0000000000000..7112ca37ba3f1 --- /dev/null +++ b/benchmark/scripts/test_utils.py @@ -0,0 +1,88 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ===--- test_utils.py ---------------------------------------------------===// +# +# This source file is part of the Swift.org open source project +# +# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +# Licensed under Apache License v2.0 with Runtime Library Exception +# +# See https://swift.org/LICENSE.txt for license information +# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +# +# ===---------------------------------------------------------------------===// +""" +Homegrown unit testing helpers. + +The issue is that the unittest.mock was added in Python 3.3 and we need to run +on Python 2.7. + +`Stub` and `Mock` are very rudimentary and support only the limited subset of +common unit testing patterns that is used in this project. 
+""" + +import sys + +from StringIO import StringIO +from contextlib import contextmanager + + +@contextmanager +def captured_output(): + """Capture stdout and stderr and return their output as string buffers.""" + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr + finally: + sys.stdout, sys.stderr = old_out, old_err + + +class Stub: + """Object literal stub for value objects.""" + + def __init__(self, **attributes): + """All named parameters will create properties on this object.""" + self.__dict__.update(attributes) + + +class Mock(object): + """Minimal infrastructure for manually mocking calls to single method. + + Classes inheriting from Mock are expected to create their own mock of the + tested method with appropriate signature, which appends the call arguments + as tuple to the `calls` list and returns the canned response retrieved from + the `respond` dictionary. + """ + + def __init__(self, responses=None): + """Optionaly initialized with a list of expected calls. See expect.""" + self.calls = [] + self.expected = [] + self.respond = dict() + responses = responses or [] + for call_args, response in responses: + self.expect(call_args, response) + + def expect(self, call_args, response): + """Expect invocation of tested method with given arguments. + + Stores the canned reponse in the `respond` dictionary. + """ + call_args = tuple(call_args) + self.expected.append(call_args) + self.respond[call_args] = response + + def assert_called_with(self, expected_args): + """Verify that the tested method was called with provided arguments.""" + expected_args = tuple(expected_args) + assert expected_args in self.calls, ( + 'Expected: {0} in Called: {1}'.format(expected_args, self.calls)) + + def assert_called_all_expected(self): + """Verify that all expeced invocations of tested method were called.""" + assert self.calls == self.expected, ( + '\nExpected: {0}, \n Called: {1}'.format( + self.expected, self.calls)) From 69d5d5e73252e4e5367e6d8c9c6d00b4da2dd542 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 16 Aug 2018 19:50:42 +0200 Subject: [PATCH 05/20] [benchmark] Adding tests for BenchmarkDriver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The imports are a bit sketchy because it doesn’t have `.py` extension and they had to be hacked manually. :-/ Extracted `parse_args` from `main` and added test coverage for argument parsing. 
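The change follows the usual recipe for making an argparse front end unit-testable: build the parser inside a function that takes the argument list as a parameter, and let `main` forward `sys.argv[1:]`. Reduced to the essentials (the real parser defines many more options, as the diff below shows):

```python
import argparse
import sys


def parse_args(args):
    """Build the parser and parse the given argument list, not sys.argv."""
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(metavar='COMMAND')
    run_parser = subparsers.add_parser('run')
    run_parser.add_argument('-i', '--iterations', type=int, default=1)
    return parser.parse_args(args)


def main():
    # Production entry point forwards the real command line.
    return parse_args(sys.argv[1:])


# Unit tests can now drive argument parsing with plain lists:
assert parse_args(['run', '-i', '5']).iterations == 5
```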
--- .gitignore | 1 + benchmark/scripts/Benchmark_Driver | 33 ++++---- benchmark/scripts/test_Benchmark_Driver.py | 93 ++++++++++++++++++++++ 3 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 benchmark/scripts/test_Benchmark_Driver.py diff --git a/.gitignore b/.gitignore index 74aa31bc37df7..05e03732a5215 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,4 @@ compile_commands.json SortedCFDatabase.def htmlcov .coverage +/benchmark/scripts/Benchmark_Driverc diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index c5a502f86917a..c23f617d0d3ca 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -300,16 +300,17 @@ def positive_int(value): return ivalue -def main(): +def parse_args(args): + """Parse command line arguments and set default values.""" parser = argparse.ArgumentParser( epilog='Example: ./Benchmark_Driver run -i 5 -f Prefix -f .*Suffix.*' ) subparsers = parser.add_subparsers( title='Swift benchmark driver commands', - help='See COMMAND -h for additional arguments', metavar='') + help='See COMMAND -h for additional arguments', metavar='COMMAND') - parent_parser = argparse.ArgumentParser(add_help=False) - benchmarks_group = parent_parser.add_mutually_exclusive_group() + shared_benchmarks_parser = argparse.ArgumentParser(add_help=False) + benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group() benchmarks_group.add_argument( 'benchmarks', default=[], @@ -318,22 +319,22 @@ def main(): '-f', '--filter', dest='filters', action='append', help='run all tests whose name match regular expression PATTERN, ' + 'multiple filters are supported', metavar="PATTERN") - parent_parser.add_argument( + shared_benchmarks_parser.add_argument( '-t', '--tests', help='directory containing Benchmark_O{,none,size} ' + '(default: DRIVER_DIR)', default=DRIVER_DIR) - - run_parser = subparsers.add_parser( - 'run', - help='Run benchmarks and output results to stdout', - parents=[parent_parser]) - run_parser.add_argument( + shared_benchmarks_parser.add_argument( '-o', '--optimization', metavar='OPT', choices=['O', 'Onone', 'Osize'], help='optimization level to use: {O,Onone,Osize}, (default: O)', default='O') + + run_parser = subparsers.add_parser( + 'run', + help='Run benchmarks and output results to stdout', + parents=[shared_benchmarks_parser]) run_parser.add_argument( '-i', '--iterations', help='number of times to run each test (default: 1)', @@ -343,7 +344,7 @@ def main(): help='log results to directory (default: no logging)') run_parser.add_argument( '--swift-repo', - help='absolute path to Swift source repo for branch comparison') + help='absolute path to the Swift source repository') run_parser.set_defaults(func=run) compare_parser = subparsers.add_parser( @@ -354,7 +355,7 @@ def main(): help='directory containing benchmark logs') compare_parser.add_argument( '--swift-repo', required=True, - help='absolute path to Swift source repo') + help='absolute path to the Swift source repository') compare_parser.add_argument( '--compare-script', required=True, help='absolute path to compare script') @@ -364,7 +365,11 @@ def main(): 'branch (default: master)') compare_parser.set_defaults(func=compare) - args = parser.parse_args() + return parser.parse_args(args) + + +def main(): + args = parse_args(sys.argv[1:]) return args.func(args) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py new file mode 100644 index 0000000000000..26b66b126ce89 --- /dev/null 
+++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -0,0 +1,93 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ===--- test_Benchmark_Driver.py ----------------------------------------===// +# +# This source file is part of the Swift.org open source project +# +# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +# Licensed under Apache License v2.0 with Runtime Library Exception +# +# See https://swift.org/LICENSE.txt for license information +# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +# +# ===---------------------------------------------------------------------===// + +import os +import unittest + +from imp import load_source + +from test_utils import captured_output +# import Benchmark_Driver # doesn't work because it misses '.py' extension +Benchmark_Driver = load_source( + 'Benchmark_Driver', os.path.join(os.path.dirname( + os.path.abspath(__file__)), 'Benchmark_Driver')) +# from Benchmark_Driver import parse_args +parse_args = Benchmark_Driver.parse_args + + +class Test_parse_args(unittest.TestCase): + def assert_contains(self, texts, output): + assert not isinstance(texts, str) + for text in texts: + self.assertIn(text, output) + + def test_requires_command_argument(self): + with captured_output() as (_, err): + self.assertRaises(SystemExit, parse_args, []) + self.assert_contains(['usage:', 'COMMAND', 'too few arguments'], + err.getvalue()) + + def test_command_help_lists_commands(self): + with captured_output() as (out, _): + self.assertRaises(SystemExit, parse_args, ['-h']) + self.assert_contains(['COMMAND', 'run', 'compare'], + out.getvalue()) + + def test_run_benchmarks_by_name_or_ordinal(self): + benchmarks = ['AngryPhonebook', '42'] + self.assertEquals( + parse_args(['run'] + benchmarks).benchmarks, benchmarks) + + def test_run_benchmarks_matching_pattern(self): + regexes = ['Prefix', '.*Suffix.*'] + filters = ['-f', regexes[0], '-f', regexes[1]] + self.assertEquals(parse_args(['run'] + filters).filters, regexes) + + def test_run_benchmarks_and_filters_are_exclusive(self): + with captured_output() as (_, err): + self.assertRaises(SystemExit, + parse_args, 'run -f Filter1 Benchmark1'.split()) + self.assert_contains( + ['error', + 'argument BENCHMARK: not allowed with argument -f/--filter'], + err.getvalue()) + + def test_tests_location(self): + here = os.path.dirname(os.path.abspath(__file__)) + self.assertEquals(parse_args(['run']).tests, here) + tests = '/benchmarks/are/here' + self.assertEquals(parse_args(['run', '-t', tests]).tests, tests) + + def test_optimization_argument(self): + self.assertEquals(parse_args(['run']).optimization, 'O') + self.assertEquals( + parse_args(['run', '-o', 'O']).optimization, 'O') + self.assertEquals( + parse_args(['run', '-o', 'Onone']).optimization, 'Onone') + self.assertEquals( + parse_args(['run', '-o', 'Osize']).optimization, 'Osize') + + with captured_output() as (_, err): + self.assertRaises(SystemExit, + parse_args, ['run', '-o', 'bogus']) + self.assert_contains( + ['error:', + "argument -o/--optimization: invalid choice: 'bogus'", + "(choose from 'O', 'Onone', 'Osize')"], + err.getvalue()) + + +if __name__ == '__main__': + unittest.main() From ce39b129296f27d1b720919d22cb836d84fe07d9 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 16 Aug 2018 19:53:22 +0200 Subject: [PATCH 06/20] [benchmark] Strangler: BenchmarkDriver `get_tests` See https://www.martinfowler.com/bliki/StranglerApplication.html for more info on the used pattern for refactoring legacy applications. 
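Schematically, the first strangled seam looks like the condensed sketch below. It is illustrative only, not the actual driver code; the hypothetical `FakeSubprocess` plays the role the mock does in the unit tests:

```python
import subprocess


class BenchmarkDriver(object):
    def __init__(self, args, tests=None, _subprocess=None):
        self.args = args
        # Injection seam: tests substitute a stand-in for the subprocess
        # module instead of launching real benchmark binaries.
        self._subprocess = _subprocess or subprocess
        self.tests = tests or self._get_tests()

    def _invoke(self, cmd):
        return self._subprocess.check_output(
            cmd, stderr=self._subprocess.STDOUT)

    def _get_tests(self):
        # Condensed: the real method also parses names and applies filters.
        return self._invoke(['Benchmark_O', '--list']).split('\n')[1:-1]


class FakeSubprocess(object):
    """Hypothetical stand-in that replays canned Benchmark_O output."""

    STDOUT = object()

    def check_output(self, cmd, stderr=None):
        return '#\tTest\t[Tags]\n1\tAngryPhonebook\t[String]\n'


driver = BenchmarkDriver(args=None, _subprocess=FakeSubprocess())
assert driver.tests == ['1\tAngryPhonebook\t[String]']
```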
Introduced class `BenchmarkDriver` as a beginning of strangler application that will gradually replace old functions. Used it instead of `get_tests()` function in Benchmark_Driver. The interaction with Benchmark_O is simulated through mocking. `SubprocessMock` class records the invocations of command line processes and responds with canned replies in the format of Benchmark_O output. Removed 3 redundant lit tests that are now covered by the unit test `test_gets_list_of_all_benchmarks_when_benchmarks_args_exist`. This saves 3 seconds from test execution. Keeping only single integration test that verifies that the plumbing is connected correstly. --- benchmark/scripts/Benchmark_Driver | 86 ++++++++++++++-------- benchmark/scripts/test_Benchmark_Driver.py | 85 ++++++++++++++++++++- test/benchmark/Benchmark_Driver.test-sh | 11 --- 3 files changed, 139 insertions(+), 43 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index c23f617d0d3ca..bb9d47968dc47 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -24,6 +24,59 @@ import time DRIVER_DIR = os.path.dirname(os.path.realpath(__file__)) +class BenchmarkDriver(object): + def __init__(self, args, tests=None, _subprocess=None): + self.args = args + self._subprocess = _subprocess or subprocess + self.tests = tests or self._get_tests() + + def _invoke(self, cmd): + return self._subprocess.check_output( + cmd, stderr=self._subprocess.STDOUT) + + @property + def test_harness(self): + return os.path.join(self.args.tests, + "Benchmark_" + self.args.optimization) + + @property + def _cmd_list_benchmarks(self): + # Use tab delimiter for easier parsing to override the default comma. + # (The third 'column' is always comma-separated list of tags in square + # brackets -- currently unused here.) + return [self.test_harness, '--list', '--delim=\t'] + ( + ['--skip-tags='] if (self.args.benchmarks or + self.args.filters) else []) + + def _get_tests(self): + """Return a list of performance tests to run.""" + index_name_pairs = [ + line.split('\t')[:2] for line in + self._invoke(self._cmd_list_benchmarks).split('\n')[1:-1] + ] + # unzip list of pairs into 2 lists + indices, self.all_tests = map(list, zip(*index_name_pairs)) + if self.args.filters: + return self._tests_matching_patterns() + if self.args.benchmarks: + return self._tests_by_name_or_index(indices) + return self.all_tests + + def _tests_matching_patterns(self): + regexes = [re.compile(pattern) for pattern in self.args.filters] + return sorted(list(set([name for pattern in regexes + for name in self.all_tests + if pattern.match(name)]))) + + def _tests_by_name_or_index(self, indices): + benchmarks = set(self.args.benchmarks) + index_to_name = dict(zip(indices, self.all_tests)) + indexed_names = [index_to_name[i] + for i in benchmarks.intersection(set(indices))] + return sorted(list( + benchmarks.intersection(set(self.all_tests)).union(indexed_names))) + + def instrument_test(driver_path, test, num_samples): """Run a test and instrument its peak memory use""" test_outputs = [] @@ -63,34 +116,6 @@ def instrument_test(driver_path, test, num_samples): return avg_test_output -def get_tests(driver_path, args): - """Return a list of available performance tests""" - driver = ([driver_path, '--list']) - # Use tab delimiter for easier parsing to override the default comma. - # (The third 'column' is always comma-separated list of tags in square - # brackets -- currently unused here.) 
- driver.append('--delim=\t') - if args.benchmarks or args.filters: - driver.append('--skip-tags=') # list all tests, don't skip any tags - index_name_pairs = [ - line.split('\t')[:2] for line in - subprocess.check_output(driver).split('\n')[1:-1] - ] - indices, names = zip(*index_name_pairs) # unzip list of pairs into 2 lists - if args.filters: - regexes = [re.compile(pattern) for pattern in args.filters] - return sorted(list(set([name for pattern in regexes - for name in names if pattern.match(name)]))) - if not args.benchmarks: - return names - benchmarks = set(args.benchmarks) - index_to_name = dict(index_name_pairs) - indexed_names = [index_to_name[i] - for i in benchmarks.intersection(set(indices))] - return sorted(list( - benchmarks.intersection(set(names)).union(indexed_names))) - - def get_current_git_branch(git_repo_path): """Return the selected branch for the repo `git_repo_path`""" return subprocess.check_output( @@ -188,10 +213,9 @@ def run_benchmarks(driver, benchmarks=[], num_samples=10, verbose=False, def run(args): - optset = args.optimization - file = os.path.join(args.tests, "Benchmark_" + optset) + driver = BenchmarkDriver(args) run_benchmarks( - file, benchmarks=get_tests(file, args), + driver.test_harness, benchmarks=driver.tests, num_samples=args.iterations, verbose=True, log_directory=args.output_dir, swift_repo=args.swift_repo) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 26b66b126ce89..a8af2b5bf23b5 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -18,13 +18,15 @@ from imp import load_source -from test_utils import captured_output +from test_utils import Mock, captured_output + # import Benchmark_Driver # doesn't work because it misses '.py' extension Benchmark_Driver = load_source( 'Benchmark_Driver', os.path.join(os.path.dirname( os.path.abspath(__file__)), 'Benchmark_Driver')) # from Benchmark_Driver import parse_args parse_args = Benchmark_Driver.parse_args +BenchmarkDriver = Benchmark_Driver.BenchmarkDriver class Test_parse_args(unittest.TestCase): @@ -89,5 +91,86 @@ def test_optimization_argument(self): err.getvalue()) +class ArgsStub(object): + def __init__(self): + self.benchmarks = None + self.filters = None + self.tests = '/benchmarks/' + self.optimization = 'O' + + +class SubprocessMock(Mock): + """Mock for subprocess module's `check_output` method.""" + STDOUT = object() + + def __init__(self, responses=None): + super(SubprocessMock, self).__init__() + + def _check_output(args, stdin=None, stdout=None, stderr=None, + shell=False): + return self.record_and_respond(args, stdin, stdout, stderr, shell) + self.check_output = _check_output + + def record_and_respond(self, args, stdin, stdout, stderr, shell): + # _ = stdin, stdout, shell # ignored in mock + assert stderr == self.STDOUT, 'Errors are NOT redirected to STDOUT' + args = tuple(args) + self.calls.append(args) + return self.respond.get(args, '') + + +class TestBenchmarkDriverInitialization(unittest.TestCase): + def setUp(self): + self.args = ArgsStub() + self.subprocess_mock = SubprocessMock() + + def test_test_harness(self): + self.assertEquals( + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/benchmarks/Benchmark_O') + self.args.tests = '/path' + self.args.optimization = 'Suffix' + self.assertEquals( + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/path/Benchmark_Suffix') + + def test_gets_list_of_precommit_benchmarks(self): + 
self.subprocess_mock.expect( + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n1\tBenchmark2\t[t3]\n') + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, + ['Benchmark1', 'Benchmark2']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2']) + + list_all_tests = ( + '/benchmarks/Benchmark_O --list --delim=\t --skip-tags='.split(' '), + """# Test [Tags] +1 Benchmark1 [t1, t2] +2 Benchmark2 [t3] +3 Benchmark3 [t3, t4] +""") + + def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self): + """Filters tests by name or test number, ignoring unknown.""" + self.args.benchmarks = '1 Benchmark3 1 bogus'.split() + self.subprocess_mock.expect(*self.list_all_tests) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, ['Benchmark1', 'Benchmark3']) + + def test_filters_benchmarks_by_pattern(self): + self.args.filters = '-f .+3'.split() + self.subprocess_mock.expect(*self.list_all_tests) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, ['Benchmark3']) + + if __name__ == '__main__': unittest.main() diff --git a/test/benchmark/Benchmark_Driver.test-sh b/test/benchmark/Benchmark_Driver.test-sh index dec24a369da2b..ad7559c6147da 100644 --- a/test/benchmark/Benchmark_Driver.test-sh +++ b/test/benchmark/Benchmark_Driver.test-sh @@ -4,19 +4,8 @@ // REQUIRES: CMAKE_GENERATOR=Ninja // Integration tests between Benchmark_Driver and Benchmark_O -// TODO: Keep the "run just once" check and move the rest into unit tests for -// Benchmark_Driver, as they are redundant and unnecessarily slow. - -// RUN: %Benchmark_Driver run Ackermann | %FileCheck %s --check-prefix RUNNAMED -// RUNNAMED: Ackermann - -// RUN: %Benchmark_Driver run 1 | %FileCheck %s --check-prefix RUNBYNUMBER -// RUNBYNUMBER: Ackermann // RUN: %Benchmark_Driver run 1 Ackermann 1 \ // RUN: | %FileCheck %s --check-prefix RUNJUSTONCE // RUNJUSTONCE-LABEL: Ackermann // RUNJUSTONCE-NOT: Ackermann - -// RUN: %Benchmark_Driver run -f Acker | %FileCheck %s --check-prefix RUNFILTER -// RUNFILTER: Ackermann From d079607488cd5792b327c5b7a15bc0e73708b2b7 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Mon, 13 Aug 2018 15:19:58 +0200 Subject: [PATCH 07/20] [benchmark] Documentation improvements --- benchmark/scripts/compare_perf_tests.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 2f09f63da2e5e..535c7f2f743d1 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -12,7 +12,11 @@ # See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors # # ===---------------------------------------------------------------------===// +""" +This script is used for comparing performance test results. +It is structured into several classes that can be imported into other modules. 
+""" from __future__ import print_function import argparse @@ -22,9 +26,10 @@ class PerformanceTestResult(object): - """PerformanceTestResult holds results from executing an individual - benchmark from the Swift Benchmark Suite as reported by the test driver - (Benchmark_O, Benchmark_Onone, Benchmark_Osize or Benchmark_Driver). + u"""Result from executing an individual Swift Benchmark Suite benchmark. + + Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize + or Benchmark_Driver). It depends on the log format emitted by the test driver in the form: #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) @@ -33,9 +38,11 @@ class PerformanceTestResult(object): Benchmark_Driver to measure rough memory use during the execution of the benchmark. """ + def __init__(self, csv_row): - """PerformanceTestResult instance is created from an iterable with - length of 8 or 9. (Like a row provided by the CSV parser.) + """Initialized from a row with 8 or 9 columns with benchmark summary. + + The row is an iterable, such as a row provided by the CSV parser. """ # csv_row[0] is just an ordinal number of the test - skip that self.name = csv_row[1] # Name of the performance test @@ -52,6 +59,7 @@ def __init__(self, csv_row): int(csv_row[8]) if len(csv_row) > 8 else None) def __repr__(self): + """Short summary for debugging purposes.""" return ( ' Date: Mon, 13 Aug 2018 15:28:41 +0200 Subject: [PATCH 08/20] [benchmark] Extracted LogParser class --- benchmark/scripts/compare_perf_tests.py | 52 ++++++---- benchmark/scripts/test_compare_perf_tests.py | 103 ++++++++++++------- 2 files changed, 98 insertions(+), 57 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 535c7f2f743d1..b445b25c6f284 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -133,35 +133,42 @@ def __init__(self, old, new): (new.min < old.min and old.min < new.max)) +class LogParser(object): + @staticmethod + def load_from_csv(filename): # handles output from Benchmark_O and + def skip_totals(row): # Benchmark_Driver (added MAX_RSS column) + return len(row) > 7 and row[0].isdigit() + tests = map(PerformanceTestResult, + filter(skip_totals, csv.reader(open(filename)))) + + def add_or_merge(names, r): + if r.name not in names: + names[r.name] = r + else: + names[r.name].merge(r) + return names + return reduce(add_or_merge, tests, dict()) + class TestComparator(object): - """TestComparator parses `PerformanceTestResult`s from CSV log files. - Then it determines which tests were `added`, `removed` and which can be + """Analyzes changes betweeen the old and new test results. + + It determines which tests were `added`, `removed` and which can be compared. It then splits the `ResultComparison`s into 3 groups according to the `delta_threshold` by the change in performance: `increased`, - `descreased` and `unchanged`. + `descreased` and `unchanged`. Whole computaion is performed during + initialization and results are provided as properties on this object. The lists of `added`, `removed` and `unchanged` tests are sorted alphabetically. The `increased` and `decreased` lists are sorted in descending order by the amount of change. 
""" - def __init__(self, old_file, new_file, delta_threshold): - - def load_from_CSV(filename): # handles output from Benchmark_O and - def skip_totals(row): # Benchmark_Driver (added MAX_RSS column) - return len(row) > 7 and row[0].isdigit() - tests = map(PerformanceTestResult, - filter(skip_totals, csv.reader(open(filename)))) - - def add_or_merge(names, r): - if r.name not in names: - names[r.name] = r - else: - names[r.name].merge(r) - return names - return reduce(add_or_merge, tests, dict()) - - old_results = load_from_CSV(old_file) - new_results = load_from_CSV(new_file) + + def __init__(self, old_results, new_results, delta_threshold): + """Initialized with dictionaries of old and new benchmark results. + + Dictionary keys are benchmark names, values are + `PerformanceTestResult`s. + """ old_tests = set(old_results.keys()) new_tests = set(new_results.keys()) comparable_tests = new_tests.intersection(old_tests) @@ -447,7 +454,8 @@ def parse_args(args): def main(): """Compare benchmarks for changes in a formatted report.""" args = parse_args(sys.argv[1:]) - comparator = TestComparator(args.old_file, args.new_file, + comparator = TestComparator(LogParser.load_from_csv(args.old_file), + LogParser.load_from_csv(args.new_file), args.delta_threshold) formatter = ReportFormatter(comparator, args.old_branch, args.new_branch, args.changes_only, args.single_table) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index c92b9a8e90448..93fe0053fdf26 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -19,6 +19,7 @@ import tempfile import unittest +from compare_perf_tests import LogParser from compare_perf_tests import PerformanceTestResult from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison @@ -121,38 +122,11 @@ def test_values_is_dubious(self): self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious) -class OldAndNewLog(unittest.TestCase): +class FileSystemIntegration(unittest.TestCase): def setUp(self): # Create a temporary directory self.test_dir = tempfile.mkdtemp() - self.old_log = self.write_temp_file('old.log', """ -1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365 -2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445 -3,Array2D,20,335831,400221,346622,0,346622,28297216 -4,ArrayAppend,20,23641,29000,24990,0,24990,11149926 -34,BitCount,20,3,4,4,0,4,10192896 -35,ByteSwap,20,4,6,4,0,4,10185933 - -Totals,269,67351871,70727022,68220188,0,0,0 -""") - # Log lines are deliberately in descending sort order in order to test - # the ordering of the comparison results. - # ArrayAppend is included twice to test results merging. 
- self.new_log = self.write_temp_file('new.log', """ -265,TwoSum,20,5006,5679,5111,0,5111 -35,ByteSwap,20,0,0,0,0,0 -34,BitCount,20,9,9,9,0,9 -4,ArrayAppend,20,23641,29000,24990,0,24990 -3,Array2D,20,335831,400221,346622,0,346622 -1,AngryPhonebook,20,10458,12714,11000,0,11000 - -Totals,269,67351871,70727022,68220188,0,0,0 -4,ArrayAppend,1,20000,20000,20000,0,20000 - -Totals,1,20000,20000,20000,0,0,0 -""") - def tearDown(self): # Remove the directory after the test shutil.rmtree(self.test_dir) @@ -163,27 +137,81 @@ def write_temp_file(self, file_name, data): f.write(data) return temp_file_name + +class OldAndNewLog(unittest.TestCase): + old_log_content = """1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365 +2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445 +3,Array2D,20,335831,400221,346622,0,346622,28297216 +4,ArrayAppend,20,23641,29000,24990,0,24990,11149926 +34,BitCount,20,3,4,4,0,4,10192896 +35,ByteSwap,20,4,6,4,0,4,10185933""" + + new_log_content = """265,TwoSum,20,5006,5679,5111,0,5111 +35,ByteSwap,20,0,0,0,0,0 +34,BitCount,20,9,9,9,0,9 +4,ArrayAppend,20,20000,29000,24990,0,24990 +3,Array2D,20,335831,400221,346622,0,346622 +1,AngryPhonebook,20,10458,12714,11000,0,11000""" + + old_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + [line.split(',') + for line in + old_log_content.splitlines()])]) + + new_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + [line.split(',') + for line in + new_log_content.splitlines()])]) + def assert_report_contains(self, texts, report): assert not isinstance(texts, str) for text in texts: self.assertIn(text, report) +class TestLogParser(FileSystemIntegration): + def test_load_from_csv(self): + """Ignores header row, empty lines and Totals row.""" + log_file = self.write_temp_file('log.log',""" +#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) +34,BitCount,20,3,4,4,0,4 + +Totals,269 +""") + results = LogParser.load_from_csv(log_file) + self.assertEquals(results.keys(), ['BitCount']) + self.assertTrue(isinstance(results['BitCount'], PerformanceTestResult)) + + def test_merge(self): + concatenated_logs = self.write_temp_file('concat.log', """ +4,ArrayAppend,20,23641,29000,24990,0,24990 +4,ArrayAppend,1,20000,20000,20000,0,20000 +""") + results = LogParser.load_from_csv(concatenated_logs) + self.assertEquals(results.keys(), ['ArrayAppend']) + result = results['ArrayAppend'] + self.assertTrue(isinstance(result, PerformanceTestResult)) + self.assertEquals(result.min, 20000) + self.assertEquals(result.max, 29000) + + class TestTestComparator(OldAndNewLog): def test_init(self): - old_log, new_log = self.old_log, self.new_log - def names(tests): return [t.name for t in tests] - tc = TestComparator(old_log, new_log, 0.05) + tc = TestComparator(self.old_results, self.new_results, 0.05) self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) self.assertEquals(names(tc.increased), ['ByteSwap', 'ArrayAppend']) self.assertEquals(names(tc.decreased), ['BitCount']) self.assertEquals(names(tc.added), ['TwoSum']) self.assertEquals(names(tc.removed), ['AnyHashableWithAClass']) # other way around - tc = TestComparator(new_log, old_log, 0.05) + tc = TestComparator(self.new_results, self.old_results, 0.05) self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) self.assertEquals(names(tc.increased), ['BitCount']) self.assertEquals(names(tc.decreased), ['ByteSwap', 'ArrayAppend']) @@ -191,7 +219,7 @@ def names(tests): self.assertEquals(names(tc.removed), ['TwoSum']) # delta_threshold 
determines the sorting into change groups; # report only change above 100% (ByteSwap's runtime went to 0): - tc = TestComparator(old_log, new_log, 1) + tc = TestComparator(self.old_results, self.new_results, 1) self.assertEquals( names(tc.unchanged), ['AngryPhonebook', 'Array2D', 'ArrayAppend', 'BitCount'] @@ -203,7 +231,7 @@ def names(tests): class TestReportFormatter(OldAndNewLog): def setUp(self): super(TestReportFormatter, self).setUp() - self.tc = TestComparator(self.old_log, self.new_log, 0.05) + self.tc = TestComparator(self.old_results, self.new_results, 0.05) self.rf = ReportFormatter(self.tc, '', '', changes_only=False) self.markdown = self.rf.markdown() self.git = self.rf.git() @@ -445,7 +473,7 @@ def test_branch_arguments(self): self.assertEquals(args.new_branch, 'amazing-optimization') -class Test_compare_perf_tests_main(OldAndNewLog): +class Test_compare_perf_tests_main(OldAndNewLog, FileSystemIntegration): """Integration test that invokes the whole comparison script.""" markdown = [ 'Regression (1)', @@ -459,6 +487,11 @@ class Test_compare_perf_tests_main(OldAndNewLog): ] html = ['', "BitCount"] + def setUp(self): + super(Test_compare_perf_tests_main, self).setUp() + self.old_log = self.write_temp_file('old.log', self.old_log_content) + self.new_log = self.write_temp_file('new.log', self.new_log_content) + def execute_main_with_format(self, report_format, test_output=False): report_file = self.test_dir + 'report.log' args = ['compare_perf_tests.py', From d0cdaee798f8b7ca23564c26feb884801d437f33 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Mon, 13 Aug 2018 17:28:41 +0200 Subject: [PATCH 09/20] [benchmark] LogParser support for --verbose mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LogParser doesn’t use `csv.reader` anymore. Parsing is handled by a Finite State Machine. Each line is matched against a set of (mutually exclusive) regular expressions that represent known states. When a match is found, corresponding parsing action is taken. --- benchmark/scripts/compare_perf_tests.py | 88 ++++++++++++++++++-- benchmark/scripts/test_compare_perf_tests.py | 69 +++++++++++---- 2 files changed, 135 insertions(+), 22 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index b445b25c6f284..b563cb059d6b8 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -20,7 +20,7 @@ from __future__ import print_function import argparse -import csv +import re import sys from math import sqrt @@ -57,6 +57,8 @@ def __init__(self, csv_row): self.median = int(csv_row[7]) # Median runtime (ms) self.max_rss = ( # Maximum Resident Set Size (B) int(csv_row[8]) if len(csv_row) > 8 else None) + # Sample lists for statistical analysis of measured results + self.all_samples = None def __repr__(self): """Short summary for debugging purposes.""" @@ -134,12 +136,63 @@ def __init__(self, old, new): class LogParser(object): + """Converts log outputs into `PerformanceTestResult`s. + + Supports various formats produced by the `Benchmark_Driver` and + `Benchmark_O`('Onone', 'Osize'). It can also merge together the + results from concatenated log files. 
+ """ + + def __init__(self): + """Create instance of `LogParser`.""" + self.results, self.samples, self.num_iters = [], [], 1 + + # Parse lines like this + # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) + results_re = re.compile(r'(\d+,[ \t]*\w+,' + + ','.join([r'[ \t]*[\d.]+'] * 6) + ')') + + def _append_result(self, result): + r = PerformanceTestResult(result.split(',')) + if self.samples: + r.all_samples = self.samples + self.results.append(r) + self.num_iters, self.samples = 1, [] + + # Regular expression and action to take when it matches the parsed line + state_actions = { + results_re: _append_result, + + # Verbose mode adds two new productions: + # Adaptively determined N; test loop multiple adjusting runtime to ~1s + re.compile(r'\s+Measuring with scale (\d+).'): + (lambda self, num_iters: setattr(self, 'num_iters', num_iters)), + + re.compile(r'\s+Sample (\d+),(\d+)'): + (lambda self, i, runtime: + self.samples.append((int(i), int(self.num_iters), int(runtime))) + ), + } + + def parse_results(self, lines): + """Parse results from the lines of the log output from Benchmark*. + + Returns a list of `PerformanceTestResult`s. + """ + for line in lines: + for regexp, action in LogParser.state_actions.items(): + match = regexp.match(line) + if match: + action(self, *match.groups()) + break # stop after 1st match + else: # If none matches, skip the line. + # print('skipping: ' + line.rstrip('\n')) + continue + return self.results + @staticmethod - def load_from_csv(filename): # handles output from Benchmark_O and - def skip_totals(row): # Benchmark_Driver (added MAX_RSS column) - return len(row) > 7 and row[0].isdigit() - tests = map(PerformanceTestResult, - filter(skip_totals, csv.reader(open(filename)))) + def _results_from_lines(lines): + tests = LogParser().parse_results(lines) def add_or_merge(names, r): if r.name not in names: @@ -147,8 +200,27 @@ def add_or_merge(names, r): else: names[r.name].merge(r) return names + return reduce(add_or_merge, tests, dict()) + @staticmethod + def results_from_string(log_contents): + """Parse `PerformanceTestResult`s from the supplied string. + + Returns dictionary of test names and `PerformanceTestResult`s. + """ + return LogParser._results_from_lines(log_contents.splitlines()) + + @staticmethod + def results_from_file(log_file): + """Parse `PerformanceTestResult`s from the log file. + + Returns dictionary of test names and `PerformanceTestResult`s. + """ + with open(log_file) as f: + return LogParser._results_from_lines(f.readlines()) + + class TestComparator(object): """Analyzes changes betweeen the old and new test results. 
@@ -454,8 +526,8 @@ def parse_args(args): def main(): """Compare benchmarks for changes in a formatted report.""" args = parse_args(sys.argv[1:]) - comparator = TestComparator(LogParser.load_from_csv(args.old_file), - LogParser.load_from_csv(args.new_file), + comparator = TestComparator(LogParser.results_from_file(args.old_file), + LogParser.results_from_file(args.new_file), args.delta_threshold) formatter = ReportFormatter(comparator, args.old_branch, args.new_branch, args.changes_only, args.single_table) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 93fe0053fdf26..cf8ab7e00b14d 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -173,25 +173,66 @@ def assert_report_contains(self, texts, report): self.assertIn(text, report) -class TestLogParser(FileSystemIntegration): - def test_load_from_csv(self): +class TestLogParser(unittest.TestCase): + def test_parse_results_csv(self): """Ignores header row, empty lines and Totals row.""" - log_file = self.write_temp_file('log.log',""" -#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) + log = """#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) 34,BitCount,20,3,4,4,0,4 Totals,269 -""") - results = LogParser.load_from_csv(log_file) - self.assertEquals(results.keys(), ['BitCount']) - self.assertTrue(isinstance(results['BitCount'], PerformanceTestResult)) +""" + parser = LogParser() + results = parser.parse_results(log.splitlines()) + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + self.assertEquals(results[0].name, 'BitCount') + + def test_parse_results_verbose(self): + """Parse multiple performance test results with 2 sample formats: + single line for N = 1; two lines for N > 1. + """ + verbose_log = """--- DATA --- +#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) +Running AngryPhonebook for 3 samples. + Measuring with scale 78. + Sample 0,11812 + Measuring with scale 90. + Sample 1,13898 + Measuring with scale 91. + Sample 2,11467 +1,AngryPhonebook,3,11467,13898,12392,1315,11812 +Running Array2D for 3 samples. 
+ Sample 0,369900 + Sample 1,381039 + Sample 2,371043 +3,Array2D,3,369900,381039,373994,6127,371043 + +Totals,2""" + parser = LogParser() + results = parser.parse_results(verbose_log.split('\n')) - def test_merge(self): - concatenated_logs = self.write_temp_file('concat.log', """ -4,ArrayAppend,20,23641,29000,24990,0,24990 -4,ArrayAppend,1,20000,20000,20000,0,20000 -""") - results = LogParser.load_from_csv(concatenated_logs) + r = results[0] + self.assertEquals( + (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), + ('AngryPhonebook', 11467, 13898, 12392, 1315, 11812) + ) + self.assertEquals(r.samples, len(r.all_samples)) + self.assertEquals(results[0].all_samples, + [(0, 78, 11812), (1, 90, 13898), (2, 91, 11467)]) + + r = results[1] + self.assertEquals( + (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), + ('Array2D', 369900, 381039, 373994, 6127, 371043) + ) + self.assertEquals(r.samples, len(r.all_samples)) + self.assertEquals(results[1].all_samples, + [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) + + def test_results_from_merge(self): + """Parsing concatenated log merges same PerformanceTestResults""" + concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990 +4,ArrayAppend,1,20000,20000,20000,0,20000""" + results = LogParser.results_from_string(concatenated_logs) self.assertEquals(results.keys(), ['ArrayAppend']) result = results['ArrayAppend'] self.assertTrue(isinstance(result, PerformanceTestResult)) From c60e223a3b518696ba6df79b26519a51305231d9 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Mon, 13 Aug 2018 17:31:06 +0200 Subject: [PATCH 10/20] [benchmark] LogParser: tab & space delimited logs Added support for tab delimited and formatted log output (space aligned columns as output to console by Benchmark_Driver). 
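
For illustration, a minimal standalone sketch of the fallback splitting
(the helper name and sample rows are illustrative, not part of the
parser): a row is split on commas first, and when that yields too few
columns, whitespace splitting handles both tab-delimited logs and the
space-aligned console output.

    # Sketch only: mirrors the "split on ',' then fall back to whitespace" idea.
    def split_columns(row, min_columns=8):
        columns = row.split(',')
        if len(columns) < min_columns:   # not comma-delimited
            columns = row.split()        # tabs or aligned spaces
        return [c.strip() for c in columns]

    print(split_columns('34,BitCount,20,3,4,4,0,4'))
    print(split_columns('34\tBitCount\t20\t3\t4\t4\t0\t4'))
    print(split_columns('3  Array2D  20  2060  2188  2099  0  2099  20915200'))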
--- benchmark/scripts/compare_perf_tests.py | 10 +++++++--- benchmark/scripts/test_compare_perf_tests.py | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index b563cb059d6b8..cad0e88193b38 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -149,11 +149,15 @@ def __init__(self): # Parse lines like this # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) - results_re = re.compile(r'(\d+,[ \t]*\w+,' + - ','.join([r'[ \t]*[\d.]+'] * 6) + ')') + results_re = re.compile(r'(\d+[, \t]*\w+[, \t]*' + + r'[, \t]*'.join([r'[\d.]+'] * 6) + + r'[, \t]*[\d.]*)') # optional MAX_RSS(B) def _append_result(self, result): - r = PerformanceTestResult(result.split(',')) + columns = result.split(',') + if len(columns) < 8: + columns = result.split() + r = PerformanceTestResult(columns) if self.samples: r.all_samples = self.samples self.results.append(r) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index cf8ab7e00b14d..8a0b829a53fc7 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -186,6 +186,26 @@ def test_parse_results_csv(self): self.assertTrue(isinstance(results[0], PerformanceTestResult)) self.assertEquals(results[0].name, 'BitCount') + def test_parse_results_tab_delimited(self): + log = '34\tBitCount\t20\t3\t4\t4\t0\t4' + parser = LogParser() + results = parser.parse_results(log.splitlines()) + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + self.assertEquals(results[0].name, 'BitCount') + + def test_parse_results_formatted_text(self): + """Parse format that Benchmark_Driver prints to console""" + log = (""" +# TEST SAMPLES MIN(μs) MAX(μs) MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B) +3 Array2D 20 2060 2188 2099 0 2099 20915200 +Totals 281 2693794 2882846 2748843 0 0 0""") + parser = LogParser() + results = parser.parse_results(log.splitlines()[1:]) # without 1st \n + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + r = results[0] + self.assertEquals(r.name, 'Array2D') + self.assertEquals(r.max_rss, 20915200) + def test_parse_results_verbose(self): """Parse multiple performance test results with 2 sample formats: single line for N = 1; two lines for N > 1. From bea35cb7c15aeb963912fbd873b6f9773be2988f Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Mon, 13 Aug 2018 17:33:02 +0200 Subject: [PATCH 11/20] [benchmark] LogParser measure environment Measure more of environment during test In addition to measuring maximum resident set size, also extract number of voluntary and involuntary context switches from the verbose mode. 
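
For reference, a standalone sketch of matching these verbose lines (the
pattern and variable names here are illustrative; the parser below wires
equivalent expressions into its state/action table):

    import re

    # Environment lines emitted in --verbose mode and the values they carry.
    MAX_RSS_RE = re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)')
    VCS_RE = re.compile(r'\s+VCS \d+ - \d+ = (\d+)')
    ICS_RE = re.compile(r'\s+ICS \d+ - \d+ = (\d+)')

    lines = ['    MAX_RSS 8937472 - 8904704 = 32768 (8 pages)',
             '    ICS 1338 - 229 = 1109',
             '    VCS 2 - 1 = 1']

    for line in lines:
        for pattern in (MAX_RSS_RE, VCS_RE, ICS_RE):
            match = pattern.match(line)
            if match:
                print(tuple(int(g) for g in match.groups()))
                break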
--- benchmark/scripts/compare_perf_tests.py | 32 ++++++++++++++++++-- benchmark/scripts/test_compare_perf_tests.py | 16 ++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index cad0e88193b38..df3d9713a723d 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -145,7 +145,14 @@ class LogParser(object): def __init__(self): """Create instance of `LogParser`.""" - self.results, self.samples, self.num_iters = [], [], 1 + self.results = [] + self._reset() + + def _reset(self): + """Reset parser to the default state for reading a new result.""" + self.samples, self.num_iters = [], 1 + self.max_rss, self.mem_pages = None, None + self.voluntary_cs, self.involuntary_cs = None, None # Parse lines like this # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) @@ -158,16 +165,25 @@ def _append_result(self, result): if len(columns) < 8: columns = result.split() r = PerformanceTestResult(columns) + if self.max_rss: + r.max_rss = self.max_rss + r.mem_pages = self.mem_pages + r.voluntary_cs = self.voluntary_cs + r.involuntary_cs = self.involuntary_cs if self.samples: r.all_samples = self.samples self.results.append(r) - self.num_iters, self.samples = 1, [] + self._reset() + + def _store_memory_stats(self, max_rss, mem_pages): + self.max_rss = int(max_rss) + self.mem_pages = int(mem_pages) # Regular expression and action to take when it matches the parsed line state_actions = { results_re: _append_result, - # Verbose mode adds two new productions: + # Verbose mode adds new productions: # Adaptively determined N; test loop multiple adjusting runtime to ~1s re.compile(r'\s+Measuring with scale (\d+).'): (lambda self, num_iters: setattr(self, 'num_iters', num_iters)), @@ -176,6 +192,16 @@ def _append_result(self, result): (lambda self, i, runtime: self.samples.append((int(i), int(self.num_iters), int(runtime))) ), + + # Environmental statistics: memory usage and context switches + re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): + _store_memory_stats, + + re.compile(r'\s+VCS \d+ - \d+ = (\d+)'): + (lambda self, vcs: setattr(self, 'voluntary_cs', int(vcs))), + + re.compile(r'\s+ICS \d+ - \d+ = (\d+)'): + (lambda self, ics: setattr(self, 'involuntary_cs', int(ics))), } def parse_results(self, lines): diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 8a0b829a53fc7..f484861db4195 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -248,6 +248,22 @@ def test_parse_results_verbose(self): self.assertEquals(results[1].all_samples, [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) + def test_parse_environment_verbose(self): + """Parse stats about environment in verbose mode.""" + verbose_log = """ MAX_RSS 8937472 - 8904704 = 32768 (8 pages) + ICS 1338 - 229 = 1109 + VCS 2 - 1 = 1 +2,AngryPhonebook,3,11269,11884,11657,338,11820 +""" + parser = LogParser() + results = parser.parse_results(verbose_log.split('\n')) + + r = results[0] + self.assertEquals(r.max_rss, 32768) + self.assertEquals(r.mem_pages, 8) + self.assertEquals(r.voluntary_cs, 1) + self.assertEquals(r.involuntary_cs, 1109) + def test_results_from_merge(self): """Parsing concatenated log merges same PerformanceTestResults""" concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990 From 91077e32896238b61bf82befc0d9a178f2c6807c Mon Sep 17 00:00:00 2001 From: Pavol 
Vaskovic Date: Thu, 16 Aug 2018 13:44:02 +0200 Subject: [PATCH 12/20] [benchmark] Introduced PerformanceTestSamples * Moved the functionality to compute median, standard deviation and related statistics from `PerformanceTestResult` into `PerformanceTestSamples`. * Fixed wrong unit in comments --- benchmark/scripts/compare_perf_tests.py | 171 +++++++++++++------ benchmark/scripts/test_compare_perf_tests.py | 133 +++++++++++++-- 2 files changed, 235 insertions(+), 69 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index df3d9713a723d..d72d449192b6d 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -23,6 +23,90 @@ import re import sys from math import sqrt +from collections import namedtuple + + +class Sample(namedtuple('Sample', 'i num_iters runtime')): + u"""Single benchmark measurement. + + Initialized with: + `i`: ordinal number of the sample taken, + `num-num_iters`: number or iterations used to compute it, + `runtime`: in microseconds (μs). + """ + + def __repr__(self): + """Shorter Sample formating for debugging purposes.""" + return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) + + +class PerformanceTestSamples(object): + """Collection of runtime samples from the benchmark execution. + + Computes the sample population statistics. + """ + + def __init__(self, name, samples=None): + """Initialized with benchmark name and optional list of Samples.""" + self.name = name # Name of the performance test + self.samples = [] + self.mean = 0.0 + self.S_runtime = 0.0 # For computing running variance + for sample in samples or []: + self.add(sample) + + def add(self, sample): + """Add sample to collection and recompute statistics.""" + assert isinstance(sample, Sample) + state = (self.count, self.mean, self.S_runtime) + state = self.running_mean_variance(state, sample.runtime) + _, self.mean, self.S_runtime = state + self.samples.append(sample) + self.samples.sort(key=lambda s: s.runtime) + + @property + def count(self): + """Number of samples used to compute the statistics.""" + return len(self.samples) + + @property + def min(self): + """Minimum sampled value.""" + return self.samples[0].runtime + + @property + def max(self): + """Maximum sampled value.""" + return self.samples[-1].runtime + + @property + def median(self): + """Median sampled value.""" + return self.samples[self.count / 2].runtime + + @property + def sd(self): + u"""Standard Deviation (μs).""" + return (0 if self.count < 2 else + sqrt(self.S_runtime / (self.count - 1))) + + @staticmethod + def running_mean_variance((k, M_, S_), x): + """Compute running variance, B. P. Welford's method. 
+ + See Knuth TAOCP vol 2, 3rd edition, page 232, or + https://www.johndcook.com/blog/standard_deviation/ + M is mean, Standard Deviation is defined as sqrt(S/k-1) + """ + k = float(k + 1) + M = M_ + (x - M_) / k + S = S_ + (x - M_) * (x - M) + return (k, M, S) + + @property + def cv(self): + """Coeficient of Variation (%).""" + return (self.sd / self.mean) if self.mean else 0 class PerformanceTestResult(object): @@ -46,67 +130,48 @@ def __init__(self, csv_row): """ # csv_row[0] is just an ordinal number of the test - skip that self.name = csv_row[1] # Name of the performance test - self.samples = int(csv_row[2]) # Number of measurement samples taken - self.min = int(csv_row[3]) # Minimum runtime (ms) - self.max = int(csv_row[4]) # Maximum runtime (ms) - self.mean = int(csv_row[5]) # Mean (average) runtime (ms) - sd = int(csv_row[6]) # Standard Deviation (ms) - # For computing running variance - self.S_runtime = (0 if self.samples < 2 else - (sd * sd) * (self.samples - 1)) - self.median = int(csv_row[7]) # Median runtime (ms) + self.num_samples = ( # Number of measurement samples taken + int(csv_row[2])) + self.min = int(csv_row[3]) # Minimum runtime (μs) + self.max = int(csv_row[4]) # Maximum runtime (μs) + self.mean = float(csv_row[5]) # Mean (average) runtime (μs) + self.sd = float(csv_row[6]) # Standard Deviation (μs) + self.median = int(csv_row[7]) # Median runtime (μs) self.max_rss = ( # Maximum Resident Set Size (B) int(csv_row[8]) if len(csv_row) > 8 else None) - # Sample lists for statistical analysis of measured results - self.all_samples = None + self.samples = None def __repr__(self): """Short summary for debugging purposes.""" return ( ''.format(self)) - - @property - def sd(self): - """Standard Deviation (ms)""" - return (0 if self.samples < 2 else - sqrt(self.S_runtime / (self.samples - 1))) - - @staticmethod - def running_mean_variance((k, M_, S_), x): - """ - Compute running variance, B. P. Welford's method - See Knuth TAOCP vol 2, 3rd edition, page 232, or - https://www.johndcook.com/blog/standard_deviation/ - M is mean, Standard Deviation is defined as sqrt(S/k-1) - """ - k = float(k + 1) - M = M_ + (x - M_) / k - S = S_ + (x - M_) * (x - M) - return (k, M, S) + 'samples:{0.num_samples!r} min:{0.min!r} max:{0.max!r} ' + 'mean:{0.mean:.0f} sd:{0.sd:.0f} median:{0.median!r}>' + .format(self)) def merge(self, r): - """Merging test results recomputes min and max. - It attempts to recompute mean and standard deviation when all_samples - are available. There is no correct way to compute these values from - test results that are summaries from more than 3 samples. + """Merge two results. - The use case here is comparing tests results parsed from concatenated + Recomputes min, max and mean statistics. If all `samples` are + avaliable, it recomputes all the statistics. + The use case here is comparing test results parsed from concatenated log files from multiple runs of benchmark driver. 
""" - self.min = min(self.min, r.min) - self.max = max(self.max, r.max) - # self.median = None # unclear what to do here - - def push(x): - state = (self.samples, self.mean, self.S_runtime) - state = self.running_mean_variance(state, x) - (self.samples, self.mean, self.S_runtime) = state - - # Merging test results with up to 3 samples is exact - values = [r.min, r.max, r.median][:min(r.samples, 3)] - map(push, values) + if self.samples and r.samples: + map(self.samples.add, r.samples.samples) + sams = self.samples + self.num_samples = sams.count + self.min, self.max, self.median, self.mean, self.sd = \ + sams.min, sams.max, sams.median, sams.mean, sams.sd + else: + self.min = min(self.min, r.min) + self.max = max(self.max, r.max) + self.mean = ( # pooled mean is the weighted sum of means + (self.mean * self.num_samples) + (r.mean * r.num_samples) + ) / float(self.num_samples + r.num_samples) + self.num_samples += r.num_samples + self.max_rss = min(self.max_rss, r.max_rss) + self.median, self.sd = 0, 0 class ResultComparison(object): @@ -119,7 +184,7 @@ def __init__(self, old, new): """Initialize with old and new `PerformanceTestResult`s to compare.""" self.old = old self.new = new - assert(old.name == new.name) + assert old.name == new.name self.name = old.name # Test name, convenience accessor # Speedup ratio @@ -171,7 +236,7 @@ def _append_result(self, result): r.voluntary_cs = self.voluntary_cs r.involuntary_cs = self.involuntary_cs if self.samples: - r.all_samples = self.samples + r.samples = PerformanceTestSamples(r.name, self.samples) self.results.append(r) self._reset() @@ -190,8 +255,8 @@ def _store_memory_stats(self, max_rss, mem_pages): re.compile(r'\s+Sample (\d+),(\d+)'): (lambda self, i, runtime: - self.samples.append((int(i), int(self.num_iters), int(runtime))) - ), + self.samples.append( + Sample(int(i), int(self.num_iters), int(runtime)))), # Environmental statistics: memory usage and context switches re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index f484861db4195..b13a621ec58b3 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -21,8 +21,10 @@ from compare_perf_tests import LogParser from compare_perf_tests import PerformanceTestResult +from compare_perf_tests import PerformanceTestSamples from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison +from compare_perf_tests import Sample from compare_perf_tests import TestComparator from compare_perf_tests import main from compare_perf_tests import parse_args @@ -30,14 +32,82 @@ from test_utils import captured_output -class TestPerformanceTestResult(unittest.TestCase): +class TestSample(unittest.TestCase): + def test_has_named_fields(self): + s = Sample(1, 2, 3) + self.assertEquals(s.i, 1) + self.assertEquals(s.num_iters, 2) + self.assertEquals(s.runtime, 3) + + def test_is_iterable(self): + s = Sample(1, 2, 3) + self.assertEquals(s[0], 1) + self.assertEquals(s[1], 2) + self.assertEquals(s[2], 3) + +class TestPerformanceTestSamples(unittest.TestCase): + def setUp(self): + self.rs = [Sample(*map(int, line.split())) for line in + '0 316 233,' # this is anomalous sample - max + '1 4417 208, 2 4745 216, 3 4867 208, 4 4934 197,' + '5 5209 205, 6 4271 204, 7 4971 208, 8 5276 206,' + '9 4596 221, 10 5278 198'.split(',')] + self.samples = PerformanceTestSamples('DropFirstAnyCollection') + self.samples.add(self.rs[1]) 
+ + def test_has_name(self): + self.assertEquals(self.samples.name, 'DropFirstAnyCollection') + + def test_stores_samples(self): + self.assertEquals(self.samples.count, 1) + s = self.samples.samples[0] + self.assertTrue(isinstance(s, Sample)) + self.assertEquals(s.i, 1) + self.assertEquals(s.num_iters, 4417) + self.assertEquals(s.runtime, 208) + + def test_computes_min_max_median(self): + self.assertEquals(self.samples.min, 208) + self.assertEquals(self.samples.max, 208) + self.assertEquals(self.samples.median, 208) + self.samples.add(self.rs[2]) + self.assertEquals(self.samples.min, 208) + self.assertEquals(self.samples.max, 216) + self.assertEquals(self.samples.median, 216) + self.samples.add(self.rs[4]) + self.assertEquals(self.samples.min, 197) + self.assertEquals(self.samples.max, 216) + self.assertEquals(self.samples.median, 208) + + def assertEqualStats(self, expected_stats): + stats = (self.samples.mean, self.samples.sd, self.samples.cv) + for actual, expected in zip(stats, expected_stats): + self.assertAlmostEquals(actual, expected, places=2) + + def test_computes_mean_sd_cv(self): + self.assertEqualStats((208.0, 0.0, 0.0)) + self.samples.add(self.rs[2]) + self.assertEqualStats((212.0, 5.66, 2.67 / 100)) + self.samples.add(self.rs[3]) + self.assertEqualStats((210.67, 4.62, 2.19 / 100)) + + def test_init_with_samples(self): + ss = PerformanceTestSamples('Lots', self.rs[1:]) + self.assertEquals(ss.count, 10) + self.samples = ss + self.assertEqualStats((207.10, 7.26, 3.51 / 100)) + + +class TestPerformanceTestResult(unittest.TestCase): def test_init(self): log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' r = PerformanceTestResult(log_line.split(',')) self.assertEquals(r.name, 'AngryPhonebook') - self.assertEquals((r.samples, r.min, r.max, r.mean, r.sd, r.median), - (20, 10664, 12933, 11035, 576, 10884)) + self.assertEquals( + (r.num_samples, r.min, r.max, r.mean, r.sd, r.median), + (20, 10664, 12933, 11035, 576, 10884)) + self.assertEquals(r.samples, None) log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' r = PerformanceTestResult(log_line.split(',')) @@ -49,7 +119,7 @@ def test_repr(self): self.assertEquals( str(r), '' + 'min:10664 max:12933 mean:11035 sd:576 median:10884>' ) def test_merge(self): @@ -61,21 +131,21 @@ def test_merge(self): [line.split(',') for line in tests]) def as_tuple(r): - return (r.min, r.max, round(r.mean, 2), round(r.sd, 2), r.median, - r.max_rss) + return (r.num_samples, r.min, r.max, round(r.mean, 2), + r.sd, r.median, r.max_rss) r = results[0] self.assertEquals(as_tuple(r), - (12045, 12045, 12045, 0, 12045, 10510336)) + (1, 12045, 12045, 12045, 0, 12045, 10510336)) r.merge(results[1]) - self.assertEquals(as_tuple(r), - (12045, 12325, 12185, 197.99, 12045, 10510336)) + self.assertEquals(as_tuple(r), # drops SD and median + (2, 12045, 12325, 12185, 0, 0, 10510336)) r.merge(results[2]) - self.assertEquals(as_tuple(r), - (11616, 12325, 11995.33, 357.10, 12045, 10510336)) + self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS + (3, 11616, 12325, 11995.33, 0, 0, 10502144)) r.merge(results[3]) self.assertEquals(as_tuple(r), - (11616, 12325, 12064, 322.29, 12045, 10510336)) + (4, 11616, 12325, 12064, 0, 0, 10498048)) class TestResultComparison(unittest.TestCase): @@ -235,8 +305,9 @@ def test_parse_results_verbose(self): (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), ('AngryPhonebook', 11467, 13898, 12392, 1315, 11812) ) - self.assertEquals(r.samples, len(r.all_samples)) - self.assertEquals(results[0].all_samples, + 
self.assertEquals(r.num_samples, r.samples.count) + self.assertEquals(sorted(results[0].samples.samples, + key=lambda s: s.i), [(0, 78, 11812), (1, 90, 13898), (2, 91, 11467)]) r = results[1] @@ -244,8 +315,9 @@ def test_parse_results_verbose(self): (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), ('Array2D', 369900, 381039, 373994, 6127, 371043) ) - self.assertEquals(r.samples, len(r.all_samples)) - self.assertEquals(results[1].all_samples, + self.assertEquals(r.num_samples, r.samples.count) + self.assertEquals(sorted(results[1].samples.samples, + key=lambda s: s.i), [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) def test_parse_environment_verbose(self): @@ -275,6 +347,35 @@ def test_results_from_merge(self): self.assertEquals(result.min, 20000) self.assertEquals(result.max, 29000) + def test_results_from_merge_verbose(self): + """Parsing verbose log merges all PerformanceTestSamples. + ...this should technically be on TestPerformanceTestResult, but it's + easier to write here. ¯\_(ツ)_/¯""" + concatenated_logs = """ + Sample 0,355883 + Sample 1,358817 + Sample 2,353552 + Sample 3,350815 +3,Array2D,4,350815,358817,354766,3403,355883 + Sample 0,363094 + Sample 1,369169 + Sample 2,376131 + Sample 3,364245 +3,Array2D,4,363094,376131,368159,5931,369169""" + results = LogParser.results_from_string(concatenated_logs) + self.assertEquals(results.keys(), ['Array2D']) + result = results['Array2D'] + self.assertTrue(isinstance(result, PerformanceTestResult)) + self.assertEquals(result.min, 350815) + self.assertEquals(result.max, 376131) + self.assertEquals(result.median, 363094) + self.assertAlmostEquals(result.sd, 8443.37, places=2) + self.assertAlmostEquals(result.mean, 361463.25, places=2) + self.assertEquals(result.num_samples, 8) + samples = result.samples + self.assertTrue(isinstance(samples, PerformanceTestSamples)) + self.assertEquals(samples.count, 8) + class TestTestComparator(OldAndNewLog): def test_init(self): From 27cc77c59097014c47a612370e8b2ed23d32e525 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Fri, 17 Aug 2018 00:39:29 +0200 Subject: [PATCH 13/20] [benchmark] Exclude outliers from samples Introduce algorithm for excluding of outliers after collecting all samples using the Interquartile Range rule. The `exclude_outliers` method uses 1st and 3rd Quartile to compute Interquartile Range, then uses inner fences at Q1 - 1.5*IQR and Q3 + 1.5*IQR to remove samples outside this fence. Based on experiments with collecting hundreads and thousands of samples (`num_samples`) per test with low iteration count (`num_iters`) with ~1s runtime, this rule is very effective in providing much better quality of sample population, effectively removing short environmental fluctuations that were previously averaged into the overall result (by the adaptively determined `num_iters` to run for ~1s), enlarging the reported result with these measurement errors. This technique can be used for some benchmarks, to get more stable results faster than before. This outlier filering is employed when parsing `--verbose` test results. 
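
For reference, a minimal standalone sketch of the interquartile range
rule applied to a plain list of runtimes (the helper name is
illustrative; the real logic lives in
`PerformanceTestSamples.exclude_outliers` below):

    # Sketch only: quartiles by indexing into the sorted runtimes,
    # inner fences at Q1 - 1.5*IQR and Q3 + 1.5*IQR.
    def filter_by_iqr(runtimes):
        s = sorted(runtimes)
        n = len(s)
        q1, q3 = s[n // 4], s[(n // 2) + (n // 4)]
        iqr = q3 - q1
        lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        return [r for r in s if lo <= r <= hi]

    samples = [455, 203, 205, 207, 208, 206, 205, 206, 208, 184]
    print(filter_by_iqr(samples))  # drops 184 and 455, keeping 8 samples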
--- benchmark/scripts/compare_perf_tests.py | 91 +++++++++- benchmark/scripts/test_compare_perf_tests.py | 176 ++++++++++++++----- 2 files changed, 218 insertions(+), 49 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index d72d449192b6d..fdc0e19cb95df 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -22,8 +22,9 @@ import argparse import re import sys -from math import sqrt +from bisect import bisect, bisect_left, bisect_right from collections import namedtuple +from math import sqrt class Sample(namedtuple('Sample', 'i num_iters runtime')): @@ -50,25 +51,75 @@ def __init__(self, name, samples=None): """Initialized with benchmark name and optional list of Samples.""" self.name = name # Name of the performance test self.samples = [] + self.outliers = [] + self._runtimes = [] self.mean = 0.0 self.S_runtime = 0.0 # For computing running variance for sample in samples or []: self.add(sample) + def __str__(self): + """Text summary of benchmark statisctics.""" + return ( + '{0.name!s} n={0.count!r} ' + 'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} ' + 'Max={0.max!r} ' + 'R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} ' + 'Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}' + .format(self) if self.samples else + '{0.name!s} n=0'.format(self)) + def add(self, sample): """Add sample to collection and recompute statistics.""" assert isinstance(sample, Sample) - state = (self.count, self.mean, self.S_runtime) - state = self.running_mean_variance(state, sample.runtime) - _, self.mean, self.S_runtime = state - self.samples.append(sample) - self.samples.sort(key=lambda s: s.runtime) + self._update_stats(sample) + i = bisect(self._runtimes, sample.runtime) + self._runtimes.insert(i, sample.runtime) + self.samples.insert(i, sample) + + def _update_stats(self, sample): + old_stats = (self.count, self.mean, self.S_runtime) + _, self.mean, self.S_runtime = ( + self.running_mean_variance(old_stats, sample.runtime)) + + def exclude_outliers(self): + """Exclude outliers by applying Interquartile Range Rule. + + Moves the samples outside of the inner fences + (Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes + statistics for the remaining sample population. Optionally apply + only the top inner fence, preserving the small outliers. + + Experimentally, this rule seems to perform well-enough on the + benchmark runtimes in the microbenchmark range to filter out + the environment noise caused by preemtive multitasking. 
+ """ + lo = bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr)) + hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) + + outliers = self.samples[:lo] + self.samples[hi:] + samples = self.samples[lo:hi] + + self.__init__(self.name) # re-initialize + for sample in samples: # and + self.add(sample) # re-compute stats + self.outliers = outliers @property def count(self): """Number of samples used to compute the statistics.""" return len(self.samples) + @property + def num_samples(self): + """Number of all samples in the collection.""" + return len(self.samples) + len(self.outliers) + + @property + def all_samples(self): + """List of all samples in ascending order.""" + return sorted(self.samples + self.outliers, key=lambda s: s.i) + @property def min(self): """Minimum sampled value.""" @@ -84,6 +135,21 @@ def median(self): """Median sampled value.""" return self.samples[self.count / 2].runtime + @property + def q1(self): + """First Quartile (25th Percentile).""" + return self.samples[self.count / 4].runtime + + @property + def q3(self): + """Third Quartile (75th Percentile).""" + return self.samples[(self.count / 2) + (self.count / 4)].runtime + + @property + def iqr(self): + """Interquartile Range.""" + return self.q3 - self.q1 + @property def sd(self): u"""Standard Deviation (μs).""" @@ -108,6 +174,16 @@ def cv(self): """Coeficient of Variation (%).""" return (self.sd / self.mean) if self.mean else 0 + @property + def range(self): + """Range of samples values (Max - Min).""" + return self.max - self.min + + @property + def spread(self): + """Sample Spread; i.e. Range as (%) of Min.""" + return self.range / float(self.min) if self.min else 0 + class PerformanceTestResult(object): u"""Result from executing an individual Swift Benchmark Suite benchmark. 
@@ -160,7 +236,7 @@ def merge(self, r): if self.samples and r.samples: map(self.samples.add, r.samples.samples) sams = self.samples - self.num_samples = sams.count + self.num_samples = sams.num_samples self.min, self.max, self.median, self.mean, self.sd = \ sams.min, sams.max, sams.median, sams.mean, sams.sd else: @@ -237,6 +313,7 @@ def _append_result(self, result): r.involuntary_cs = self.involuntary_cs if self.samples: r.samples = PerformanceTestSamples(r.name, self.samples) + r.samples.exclude_outliers() self.results.append(r) self._reset() diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index b13a621ec58b3..5bc805612c5f4 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -48,55 +48,119 @@ def test_is_iterable(self): class TestPerformanceTestSamples(unittest.TestCase): def setUp(self): - self.rs = [Sample(*map(int, line.split())) for line in - '0 316 233,' # this is anomalous sample - max - '1 4417 208, 2 4745 216, 3 4867 208, 4 4934 197,' - '5 5209 205, 6 4271 204, 7 4971 208, 8 5276 206,' - '9 4596 221, 10 5278 198'.split(',')] - self.samples = PerformanceTestSamples('DropFirstAnyCollection') - self.samples.add(self.rs[1]) + self.samples = PerformanceTestSamples('B1') + self.samples.add(Sample(7, 42, 1000)) def test_has_name(self): - self.assertEquals(self.samples.name, 'DropFirstAnyCollection') + self.assertEquals(self.samples.name, 'B1') def test_stores_samples(self): self.assertEquals(self.samples.count, 1) s = self.samples.samples[0] self.assertTrue(isinstance(s, Sample)) - self.assertEquals(s.i, 1) - self.assertEquals(s.num_iters, 4417) - self.assertEquals(s.runtime, 208) - - def test_computes_min_max_median(self): - self.assertEquals(self.samples.min, 208) - self.assertEquals(self.samples.max, 208) - self.assertEquals(self.samples.median, 208) - self.samples.add(self.rs[2]) - self.assertEquals(self.samples.min, 208) - self.assertEquals(self.samples.max, 216) - self.assertEquals(self.samples.median, 216) - self.samples.add(self.rs[4]) - self.assertEquals(self.samples.min, 197) - self.assertEquals(self.samples.max, 216) - self.assertEquals(self.samples.median, 208) - - def assertEqualStats(self, expected_stats): - stats = (self.samples.mean, self.samples.sd, self.samples.cv) + self.assertEquals(s.i, 7) + self.assertEquals(s.num_iters, 42) + self.assertEquals(s.runtime, 1000) + + def assertEqualFiveNumberSummary(self, ss, expected_fns): + e_min, e_q1, e_median, e_q3, e_max = expected_fns + self.assertEquals(ss.min, e_min) + self.assertEquals(ss.q1, e_q1) + self.assertEquals(ss.median, e_median) + self.assertEquals(ss.q3, e_q3) + self.assertEquals(ss.max, e_max) + + def test_computes_five_number_summary(self): + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1000, 1000, 1000)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1100, 1100, 1100)) + self.samples.add(Sample(3, 1, 1050)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1050, 1050, 1100)) + self.samples.add(Sample(4, 1, 1025)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1100, 1100)) + self.samples.add(Sample(5, 1, 1075)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) + + def test_computes_inter_quartile_range(self): + self.assertEquals(self.samples.iqr, 0) + self.samples.add(Sample(2, 1, 1025)) + self.samples.add(Sample(3, 1, 1050)) + 
self.samples.add(Sample(4, 1, 1075)) + self.samples.add(Sample(5, 1, 1100)) + self.assertEquals(self.samples.iqr, 50) + + def assertEqualStats(self, stats, expected_stats): for actual, expected in zip(stats, expected_stats): self.assertAlmostEquals(actual, expected, places=2) def test_computes_mean_sd_cv(self): - self.assertEqualStats((208.0, 0.0, 0.0)) - self.samples.add(self.rs[2]) - self.assertEqualStats((212.0, 5.66, 2.67 / 100)) - self.samples.add(self.rs[3]) - self.assertEqualStats((210.67, 4.62, 2.19 / 100)) + ss = self.samples + self.assertEqualStats( + (ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualStats( + (ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) + + def test_computes_range_spread(self): + ss = self.samples + self.assertEqualStats( + (ss.range, ss.spread), (0, 0)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualStats( + (ss.range, ss.spread), (100, 10.0 / 100)) def test_init_with_samples(self): - ss = PerformanceTestSamples('Lots', self.rs[1:]) - self.assertEquals(ss.count, 10) - self.samples = ss - self.assertEqualStats((207.10, 7.26, 3.51 / 100)) + self.samples = PerformanceTestSamples( + 'B2', [Sample(0, 1, 1000), Sample(1, 1, 1100)]) + self.assertEquals(self.samples.count, 2) + self.assertEqualStats( + (self.samples.mean, self.samples.sd, + self.samples.range, self.samples.spread), + (1050.0, 70.71, 100, 9.52 / 100)) + + def test_can_handle_zero_runtime(self): + # guard against dividing by 0 + self.samples = PerformanceTestSamples('Zero') + self.samples.add(Sample(0, 1, 0)) + self.assertEqualStats( + (self.samples.mean, self.samples.sd, self.samples.cv, + self.samples.range, self.samples.spread), + (0, 0, 0.0, 0, 0.0)) + + def test_excludes_outliers(self): + ss = [Sample(*map(int, line.split())) for line in + '0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, ' + '5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, ' + '10 1 1050, 11 1 949, 12 1 1151'.split(',')] + self.samples = PerformanceTestSamples('Outliers', ss) + self.assertEquals(self.samples.count, 13) + self.assertEqualStats( + (self.samples.mean, self.samples.sd), (1050, 52.36)) + + self.samples.exclude_outliers() + + self.assertEquals(self.samples.count, 11) + self.assertEquals(self.samples.outliers, ss[11:]) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) + self.assertEqualStats( + (self.samples.mean, self.samples.sd), (1050, 35.36)) + + def test_excludes_outliers_zero_IQR(self): + self.samples = PerformanceTestSamples('Tight') + self.samples.add(Sample(0, 2, 23)) + self.samples.add(Sample(1, 2, 18)) + self.samples.add(Sample(2, 2, 18)) + self.assertEquals(self.samples.iqr, 0) + + self.samples.exclude_outliers() + + self.assertEquals(self.samples.count, 2) class TestPerformanceTestResult(unittest.TestCase): @@ -305,9 +369,8 @@ def test_parse_results_verbose(self): (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), ('AngryPhonebook', 11467, 13898, 12392, 1315, 11812) ) - self.assertEquals(r.num_samples, r.samples.count) - self.assertEquals(sorted(results[0].samples.samples, - key=lambda s: s.i), + self.assertEquals(r.num_samples, r.samples.num_samples) + self.assertEquals(results[0].samples.all_samples, [(0, 78, 11812), (1, 90, 13898), (2, 91, 11467)]) r = results[1] @@ -315,9 +378,8 @@ def test_parse_results_verbose(self): (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), ('Array2D', 369900, 381039, 373994, 6127, 371043) ) - self.assertEquals(r.num_samples, r.samples.count) - 
self.assertEquals(sorted(results[1].samples.samples, - key=lambda s: s.i), + self.assertEquals(r.num_samples, r.samples.num_samples) + self.assertEquals(results[1].samples.all_samples, [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) def test_parse_environment_verbose(self): @@ -376,6 +438,36 @@ def test_results_from_merge_verbose(self): self.assertTrue(isinstance(samples, PerformanceTestSamples)) self.assertEquals(samples.count, 8) + def test_excludes_outliers_from_samples(self): + verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples. + Measuring with scale 2. + Sample 0,455 + Measuring with scale 2. + Sample 1,203 + Measuring with scale 2. + Sample 2,205 + Measuring with scale 2. + Sample 3,207 + Measuring with scale 2. + Sample 4,208 + Measuring with scale 2. + Sample 5,206 + Measuring with scale 2. + Sample 6,205 + Measuring with scale 2. + Sample 7,206 + Measuring with scale 2. + Sample 8,208 + Measuring with scale 2. + Sample 9,184 +65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206 +""" + parser = LogParser() + result = parser.parse_results(verbose_log.split('\n'))[0] + self.assertEquals(result.num_samples, 10) + self.assertEquals(result.samples.count, 8) + self.assertEquals(len(result.samples.outliers), 2) + class TestTestComparator(OldAndNewLog): def test_init(self): From e80165f3164f953e7bb8449870603a4e9031064a Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 9 Aug 2018 06:33:58 +0200 Subject: [PATCH 14/20] [benchmark] Exclude only outliers from the top Option to exclude the outliers only from top of the range, leaving in the outliers on the min side. --- benchmark/scripts/compare_perf_tests.py | 5 +++-- benchmark/scripts/test_compare_perf_tests.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index fdc0e19cb95df..2c686f4553181 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -82,7 +82,7 @@ def _update_stats(self, sample): _, self.mean, self.S_runtime = ( self.running_mean_variance(old_stats, sample.runtime)) - def exclude_outliers(self): + def exclude_outliers(self, top_only=False): """Exclude outliers by applying Interquartile Range Rule. Moves the samples outside of the inner fences @@ -94,7 +94,8 @@ def exclude_outliers(self): benchmark runtimes in the microbenchmark range to filter out the environment noise caused by preemtive multitasking. 
""" - lo = bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr)) + lo = (0 if top_only else + bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))) hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) outliers = self.samples[:lo] + self.samples[hi:] diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 5bc805612c5f4..6bfe7092011c8 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -133,7 +133,7 @@ def test_can_handle_zero_runtime(self): (0, 0, 0.0, 0, 0.0)) def test_excludes_outliers(self): - ss = [Sample(*map(int, line.split())) for line in + ss = [Sample(*map(int, s.split())) for s in '0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, ' '5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, ' '10 1 1050, 11 1 949, 12 1 1151'.split(',')] @@ -161,6 +161,20 @@ def test_excludes_outliers_zero_IQR(self): self.samples.exclude_outliers() self.assertEquals(self.samples.count, 2) + self.assertEqualStats( + (self.samples.min, self.samples.max), (18, 18)) + + def test_excludes_outliers_top_only(self): + ss = [Sample(*map(int, s.split())) for s in + '0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3'.split(',')] + self.samples = PerformanceTestSamples('Top', ss) + self.assertEqualFiveNumberSummary(self.samples, (1, 2, 2, 2, 3)) + self.assertEquals(self.samples.iqr, 0) + + self.samples.exclude_outliers(top_only=True) + + self.assertEquals(self.samples.count, 4) + self.assertEqualStats((self.samples.min, self.samples.max), (1, 2)) class TestPerformanceTestResult(unittest.TestCase): From a84db8306234d73ed9a90df80fe082479162874c Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 16 Aug 2018 21:20:58 +0200 Subject: [PATCH 15/20] [benchmark] BenchmarkDriver can run tests The `run` method on `BenchmarkDriver` invokes the test harness with specified number of iterations, samples. It supports mesuring memory use and in the verbose mode it also collects individual samples and monitors the system load by counting the number of voluntary and involuntary context switches. Output is parsed using `LogParser` from `compare_perf_tests.py`. This makes that file a required dependency for the driver, therefore it is also copied to the bin directory during the build. --- benchmark/scripts/Benchmark_Driver | 43 +++++++++++++++-- benchmark/scripts/CMakeLists.txt | 5 ++ benchmark/scripts/test_Benchmark_Driver.py | 55 ++++++++++++++++++++++ 3 files changed, 100 insertions(+), 3 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index bb9d47968dc47..57592a7a39ef7 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -21,14 +21,25 @@ import subprocess import sys import time +from compare_perf_tests import LogParser + DRIVER_DIR = os.path.dirname(os.path.realpath(__file__)) class BenchmarkDriver(object): - def __init__(self, args, tests=None, _subprocess=None): + """Executes tests from Swift Benchmark Suite.""" + + def __init__(self, args, tests=None, _subprocess=None, parser=None): + """Initialized with command line arguments. + + Optional parameters for injecting dependencies; used for testing. 
+ """ self.args = args self._subprocess = _subprocess or subprocess + self.all_tests = [] self.tests = tests or self._get_tests() + self.parser = parser or LogParser() + self.results = {} def _invoke(self, cmd): return self._subprocess.check_output( @@ -36,8 +47,10 @@ class BenchmarkDriver(object): @property def test_harness(self): - return os.path.join(self.args.tests, - "Benchmark_" + self.args.optimization) + """Full path to test harness binary.""" + suffix = (self.args.optimization if hasattr(self.args, 'optimization') + else 'O') + return os.path.join(self.args.tests, "Benchmark_" + suffix) @property def _cmd_list_benchmarks(self): @@ -76,6 +89,30 @@ class BenchmarkDriver(object): return sorted(list( benchmarks.intersection(set(self.all_tests)).union(indexed_names))) + def run(self, test, num_samples=None, num_iters=None, + verbose=None, measure_memory=False): + """Execute benchmark and gather results.""" + num_samples = num_samples or 1 + num_iters = num_iters or 0 # automatically determine N to run for 1s + + cmd = self._cmd_run( + test, num_samples, num_iters, verbose, measure_memory) + output = self._invoke(cmd) + result = self.parser.results_from_string(output).items()[0][1] + return result + + def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory): + cmd = [self.test_harness, test] + if num_samples > 1: + cmd.append('--num-samples={0}'.format(num_samples)) + if num_iters > 0: + cmd.append('--num-iters={0}'.format(num_iters)) + if verbose: + cmd.append('--verbose') + if measure_memory: + cmd.append('--memory') + return cmd + def instrument_test(driver_path, test, num_samples): """Run a test and instrument its peak memory use""" diff --git a/benchmark/scripts/CMakeLists.txt b/benchmark/scripts/CMakeLists.txt index 0fa44ab38d750..4922f6dd84031 100644 --- a/benchmark/scripts/CMakeLists.txt +++ b/benchmark/scripts/CMakeLists.txt @@ -42,3 +42,8 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/Benchmark_Driver DESTINATION "${swift-bin-dir}" FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/compare_perf_tests.py + DESTINATION "${swift-bin-dir}" + FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ + GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index a8af2b5bf23b5..3755da95d05fd 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -19,6 +19,7 @@ from imp import load_source from test_utils import Mock, captured_output +from compare_perf_tests import PerformanceTestResult # import Benchmark_Driver # doesn't work because it misses '.py' extension Benchmark_Driver = load_source( @@ -162,6 +163,8 @@ def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self): self.args, _subprocess=self.subprocess_mock) self.subprocess_mock.assert_called_all_expected() self.assertEquals(driver.tests, ['Benchmark1', 'Benchmark3']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) def test_filters_benchmarks_by_pattern(self): self.args.filters = '-f .+3'.split() @@ -170,6 +173,58 @@ def test_filters_benchmarks_by_pattern(self): self.args, _subprocess=self.subprocess_mock) self.subprocess_mock.assert_called_all_expected() self.assertEquals(driver.tests, ['Benchmark3']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) + + +class LogParserStub(object): + 
results_from_string_called = False + + @staticmethod + def results_from_string(log_contents): + LogParserStub.results_from_string_called = True + r = PerformanceTestResult('3,b1,1,123,123,123,0,123'.split(',')) + return {'b1': r} + + +class TestBenchmarkDriverRunningTests(unittest.TestCase): + def setUp(self): + self.args = ArgsStub() + self.parser_stub = LogParserStub() + self.subprocess_mock = SubprocessMock() + self.subprocess_mock.expect( + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tb1\t[tag]\n') + self.driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock, + parser=self.parser_stub) + + def test_run_benchmark_with_multiple_samples(self): + self.driver.run('b1') + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b1')) + self.driver.run('b2', num_samples=5) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b2', '--num-samples=5')) + + def test_run_benchmark_with_specified_number_of_iterations(self): + self.driver.run('b', num_iters=1) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--num-iters=1')) + + def test_run_benchmark_in_verbose_mode(self): + self.driver.run('b', verbose=True) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--verbose')) + + def test_parse_results_from_running_benchmarks(self): + self.driver.run('b') + self.assertTrue(self.parser_stub.results_from_string_called) + + def test_measure_memory(self): + self.driver.run('b', measure_memory=True) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--memory')) if __name__ == '__main__': From 076415f969e0aad9e4cf7c14e60f2c633f464708 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Fri, 17 Aug 2018 00:08:21 +0200 Subject: [PATCH 16/20] [benchmark] Strangler `run_benchmarks` Replaced guts of the `run_benchmarks` function with implementation from `BenchmarDriver`. There was only single client which called it with `verbose=True`, so this parameter could be safely removed. Function `instrument_test` is replaced by running the `Benchmark_0` with `--memory` option, which implements the MAX_RSS measurement while also excluding the overhead from the benchmarking infrastructure. The incorrect computation of standard deviation was simply dropped for measurements of more than one independent sample. Bogus aggregated `Totals` statistics were removed, now reporting only the total number of executed benchmarks. --- benchmark/scripts/Benchmark_Driver | 100 ++++++------------- benchmark/scripts/compare_perf_tests.py | 2 +- benchmark/scripts/test_Benchmark_Driver.py | 39 +++++++- benchmark/scripts/test_compare_perf_tests.py | 3 +- 4 files changed, 74 insertions(+), 70 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 57592a7a39ef7..e351d09fd210f 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -113,44 +113,18 @@ class BenchmarkDriver(object): cmd.append('--memory') return cmd + def run_independent_samples(self, test): + """Run benchmark multiple times, gathering independent samples. 
-def instrument_test(driver_path, test, num_samples): - """Run a test and instrument its peak memory use""" - test_outputs = [] - for _ in range(num_samples): - test_output_raw = subprocess.check_output( - ['time', '-lp', driver_path, test], - stderr=subprocess.STDOUT - ) - peak_memory = re.match('\s*(\d+)\s*maximum resident set size', - test_output_raw.split('\n')[-15]).group(1) - test_outputs.append(test_output_raw.split()[1].split(',') + - [peak_memory]) - - # Average sample results - num_samples_index = 2 - min_index = 3 - max_index = 4 - avg_start_index = 5 - - # TODO: Correctly take stdev - avg_test_output = test_outputs[0] - avg_test_output[avg_start_index:] = map(int, - avg_test_output[avg_start_index:]) - for test_output in test_outputs[1:]: - for i in range(avg_start_index, len(test_output)): - avg_test_output[i] += int(test_output[i]) - for i in range(avg_start_index, len(avg_test_output)): - avg_test_output[i] = int(round(avg_test_output[i] / - float(len(test_outputs)))) - avg_test_output[num_samples_index] = num_samples - avg_test_output[min_index] = min( - test_outputs, key=lambda x: int(x[min_index]))[min_index] - avg_test_output[max_index] = max( - test_outputs, key=lambda x: int(x[max_index]))[max_index] - avg_test_output = map(str, avg_test_output) - - return avg_test_output + Returns the aggregated result of independent benchmark invocations. + """ + def merge_results(a, b): + a.merge(b) + return a + + return reduce(merge_results, + [self.run(test, measure_memory=True) + for _ in range(self.args.iterations)]) def get_current_git_branch(git_repo_path): @@ -197,11 +171,10 @@ def log_results(log_directory, driver, formatted_output, swift_repo=None): f.write(formatted_output) -def run_benchmarks(driver, benchmarks=[], num_samples=10, verbose=False, +def run_benchmarks(driver, log_directory=None, swift_repo=None): """Run perf tests individually and return results in a format that's - compatible with `parse_results`. If `benchmarks` is not empty, - only run tests included in it. + compatible with `LogParser`. """ # Set a constant hash seed. Some tests are currently sensitive to # fluctuations in the number of hash collisions. @@ -210,50 +183,43 @@ def run_benchmarks(driver, benchmarks=[], num_samples=10, verbose=False, # that runs the tests. 
os.environ["SWIFT_DETERMINISTIC_HASHING"] = "1" - (total_tests, total_min, total_max, total_mean) = (0, 0, 0, 0) output = [] headings = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', 'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)'] line_format = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}' - if verbose and log_directory: + if log_directory: print(line_format.format(*headings)) - for test in benchmarks: - test_output = instrument_test(driver, test, num_samples) - if test_output[0] == 'Totals': - continue - if verbose: - if log_directory: - print(line_format.format(*test_output)) - else: - print(','.join(test_output)) + else: + print(','.join(headings)) + for test in driver.tests: + r = driver.run_independent_samples(test) + test_output = map(str, [ + r.test_num, r.name, r.num_samples, r.min, r.max, int(r.mean), + int(r.sd), r.median, r.max_rss]) + if log_directory: + print(line_format.format(*test_output)) + else: + print(','.join(test_output)) output.append(test_output) - (samples, _min, _max, mean) = map(int, test_output[2:6]) - total_tests += 1 - total_min += _min - total_max += _max - total_mean += mean if not output: return formatted_output = '\n'.join([','.join(l) for l in output]) - totals = map(str, ['Totals', total_tests, total_min, total_max, - total_mean, '0', '0', '0']) + totals = ['Totals', str(len(driver.tests))] totals_output = '\n\n' + ','.join(totals) - if verbose: - if log_directory: - print(line_format.format(*([''] + totals))) - else: - print(totals_output[1:]) + if log_directory: + print(line_format.format(*([''] + totals + ([''] * 6)))) + else: + print(totals_output[1:]) formatted_output += totals_output if log_directory: - log_results(log_directory, driver, formatted_output, swift_repo) + log_results(log_directory, driver.test_harness, formatted_output, + swift_repo) return formatted_output def run(args): - driver = BenchmarkDriver(args) run_benchmarks( - driver.test_harness, benchmarks=driver.tests, - num_samples=args.iterations, verbose=True, + BenchmarkDriver(args), log_directory=args.output_dir, swift_repo=args.swift_repo) return 0 diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 2c686f4553181..c2d5388d93830 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -205,7 +205,7 @@ def __init__(self, csv_row): The row is an iterable, such as a row provided by the CSV parser. 
""" - # csv_row[0] is just an ordinal number of the test - skip that + self.test_num = csv_row[0] # Ordinal number of the test self.name = csv_row[1] # Name of the performance test self.num_samples = ( # Number of measurement samples taken int(csv_row[2])) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 3755da95d05fd..193ce3d815c73 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -18,7 +18,7 @@ from imp import load_source -from test_utils import Mock, captured_output +from test_utils import Mock, Stub, captured_output from compare_perf_tests import PerformanceTestResult # import Benchmark_Driver # doesn't work because it misses '.py' extension @@ -91,6 +91,17 @@ def test_optimization_argument(self): "(choose from 'O', 'Onone', 'Osize')"], err.getvalue()) + def test_iterations(self): + self.assertEquals(parse_args(['run']).iterations, 1) + self.assertEquals(parse_args(['run', '-i', '3']).iterations, 3) + with captured_output() as (out, err): + self.assertRaises(SystemExit, + parse_args, ['run', '-i', '-3']) + self.assert_contains( + ['error:', + "argument -i/--iterations: invalid positive_int value: '-3'"], + err.getvalue()) + class ArgsStub(object): def __init__(self): @@ -226,6 +237,32 @@ def test_measure_memory(self): self.subprocess_mock.assert_called_with( ('/benchmarks/Benchmark_O', 'b', '--memory')) + def test_run_benchmark_independent_samples(self): + self.driver.args.iterations = 3 + r = self.driver.run_independent_samples('b1') + self.assertEquals(self.subprocess_mock.calls.count( + ('/benchmarks/Benchmark_O', 'b1', '--memory')), 3) + self.assertEquals(r.num_samples, 3) # results are merged + + def test_run_bechmarks(self): + def mock_run(test): + self.assertEquals(test, 'b1') + return PerformanceTestResult( + '3,b1,1,123,123,123,0,123,888'.split(',')) + driver = Stub(tests=['b1']) + driver.run_independent_samples = mock_run + run_benchmarks = Benchmark_Driver.run_benchmarks + + with captured_output() as (out, _): + run_benchmarks(driver) + self.assertEquals('\n'.join(""" +#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) +3,b1,1,123,123,123,0,123,888 + +Totals,1 + +""".splitlines()[1:]), out.getvalue()) # removes 1st \n from multiline string + if __name__ == '__main__': unittest.main() diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 6bfe7092011c8..f1f578cb54cbe 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -181,6 +181,7 @@ class TestPerformanceTestResult(unittest.TestCase): def test_init(self): log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' r = PerformanceTestResult(log_line.split(',')) + self.assertEquals(r.test_num, '1') self.assertEquals(r.name, 'AngryPhonebook') self.assertEquals( (r.num_samples, r.min, r.max, r.mean, r.sd, r.median), @@ -346,7 +347,7 @@ def test_parse_results_formatted_text(self): log = (""" # TEST SAMPLES MIN(μs) MAX(μs) MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B) 3 Array2D 20 2060 2188 2099 0 2099 20915200 -Totals 281 2693794 2882846 2748843 0 0 0""") +Totals 281""") parser = LogParser() results = parser.parse_results(log.splitlines()[1:]) # without 1st \n self.assertTrue(isinstance(results[0], PerformanceTestResult)) From ab16999e2089ca13021fd83d31ea652e1f9ff47a Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Fri, 17 Aug 2018 08:28:49 +0200 Subject: [PATCH 17/20] [benchmark] Created 
BenchmarkDoctor (naming) `BenchmarkDoctor` analyzes performance tests and reports their conformance to the set of desired criteria. First two rules verify the naming convention. `BenchmarkDoctor` is invoked from `Benchmark_Driver` with `check` aurgument. --- benchmark/scripts/Benchmark_Driver | 108 ++++++++++++++++ benchmark/scripts/test_Benchmark_Driver.py | 143 ++++++++++++++++++++- benchmark/scripts/test_utils.py | 20 +++ 3 files changed, 269 insertions(+), 2 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index e351d09fd210f..d1097ff903820 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -15,6 +15,7 @@ import argparse import glob +import logging import os import re import subprocess @@ -127,6 +128,104 @@ class BenchmarkDriver(object): for _ in range(self.args.iterations)]) +class LoggingReportFormatter(logging.Formatter): + """Format logs as plain text or with colors on the terminal. + + Plain text outputs level, category and massage: 'DEBUG category: Hi!' + Colored output uses color coding based on the level. + """ + + import logging as log + colors = {log.DEBUG: '9', log.INFO: '2', log.WARNING: '3', log.ERROR: '1', + log.CRITICAL: '5'} + + def __init__(self, use_color=False): + """Specify if report should use colors; defaults to False.""" + super(LoggingReportFormatter, self).__init__('%(message)s') + self.use_color = use_color + + def format(self, record): + """Format the log record with level and category.""" + msg = super(LoggingReportFormatter, self).format(record) + category = ((record.name.split('.')[-1] + ': ') if '.' in record.name + else '') + return ('\033[1;3{0}m{1}{2}\033[1;0m'.format( + self.colors[record.levelno], category, msg) if self.use_color else + '{0} {1}{2}'.format(record.levelname, category, msg)) + + +class BenchmarkDoctor(object): + """Checks that the benchmark conforms to the standard set of requirements. + + Benchmarks that are part of Swift Benchmark Suite are required to follow + a set of rules that ensure quality measurements. These include naming + convention, robustness when varying execution parameters like + `num-iters` and `num-samples` (no setup overhead, constant memory + consumption). + """ + + log = logging.getLogger('BenchmarkDoctor') + log_naming = log.getChild('naming') + log_runtime = log.getChild('runtime') + log_memory = log.getChild('memory') + log.setLevel(logging.DEBUG) + + def __init__(self, args, driver=None): + """Initialize with command line parameters. + + Optional `driver` parameter for injecting dependency; used for testing. 
+ """ + super(BenchmarkDoctor, self).__init__() + self.driver = driver or BenchmarkDriver(args) + self.results = {} + self.console_handler = logging.StreamHandler(sys.stdout) + self.console_handler.setLevel(logging.DEBUG if args.verbose else + logging.INFO) + self.console_handler.setFormatter( + LoggingReportFormatter(use_color=sys.stdout.isatty())) + self.log.addHandler(self.console_handler) + self.log.debug('Checking tests: %s', ', '.join(self.driver.tests)) + + def __del__(self): + """Unregister handler on exit.""" + self.log.removeHandler(self.console_handler) + + capital_words_re = re.compile('[A-Z][a-zA-Z0-9]+') + + @staticmethod + def _name_matches_capital_words_convention(test): + name = test + match = BenchmarkDoctor.capital_words_re.match(name) + matched = match.group(0) if match else '' + if name != matched: + BenchmarkDoctor.log_naming.error( + "'%s' name doesn't conform to UpperCamelCase convention.", + name) + BenchmarkDoctor.log_naming.info( + 'See http://bit.ly/UpperCamelCase') + + @staticmethod + def _name_is_at_most_40_chars_long(test): + if len(test) > 40: + BenchmarkDoctor.log_naming.warning( + "'%s' name is longer than 40 characters.", test) + + def check(self): + checks = [ + self._name_matches_capital_words_convention, + self._name_is_at_most_40_chars_long, + ] + for test in self.driver.tests: + for check in checks: + check(test) + + @staticmethod + def run_check(args): + doctor = BenchmarkDoctor(args) + doctor.check() + return 0 + + def get_current_git_branch(git_repo_path): """Return the selected branch for the repo `git_repo_path`""" return subprocess.check_output( @@ -374,6 +473,15 @@ def parse_args(args): help='absolute path to the Swift source repository') run_parser.set_defaults(func=run) + check_parser = subparsers.add_parser( + 'check', + help='', + parents=[shared_benchmarks_parser]) + check_parser.add_argument( + '-v', '--verbose', action='store_true', + help='show more details during benchmark analysis',) + check_parser.set_defaults(func=BenchmarkDoctor.run_check) + compare_parser = subparsers.add_parser( 'compare', help='Compare benchmark results') diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 193ce3d815c73..327576e94ce09 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -13,14 +13,16 @@ # # ===---------------------------------------------------------------------===// +import logging import os import unittest from imp import load_source -from test_utils import Mock, Stub, captured_output from compare_perf_tests import PerformanceTestResult +from test_utils import Mock, MockLoggingHandler, Stub, captured_output + # import Benchmark_Driver # doesn't work because it misses '.py' extension Benchmark_Driver = load_source( 'Benchmark_Driver', os.path.join(os.path.dirname( @@ -28,6 +30,8 @@ # from Benchmark_Driver import parse_args parse_args = Benchmark_Driver.parse_args BenchmarkDriver = Benchmark_Driver.BenchmarkDriver +BenchmarkDoctor = Benchmark_Driver.BenchmarkDoctor +LoggingReportFormatter = Benchmark_Driver.LoggingReportFormatter class Test_parse_args(unittest.TestCase): @@ -45,7 +49,7 @@ def test_requires_command_argument(self): def test_command_help_lists_commands(self): with captured_output() as (out, _): self.assertRaises(SystemExit, parse_args, ['-h']) - self.assert_contains(['COMMAND', 'run', 'compare'], + self.assert_contains(['COMMAND', 'run', 'compare', 'check'], out.getvalue()) def test_run_benchmarks_by_name_or_ordinal(self): 
@@ -102,6 +106,11 @@ def test_iterations(self): "argument -i/--iterations: invalid positive_int value: '-3'"], err.getvalue()) + def test_check_supports_vebose_output(self): + self.assertFalse(parse_args(['check']).verbose) + self.assertTrue(parse_args(['check', '-v']).verbose) + self.assertTrue(parse_args(['check', '--verbose']).verbose) + class ArgsStub(object): def __init__(self): @@ -264,5 +273,135 @@ def mock_run(test): """.splitlines()[1:]), out.getvalue()) # removes 1st \n from multiline string +class BenchmarkDriverMock(Mock): + """Mock for BenchmarkDriver's `run` method""" + def __init__(self, tests, responses=None): + super(BenchmarkDriverMock, self).__init__(responses) + self.tests = tests + self.args = ArgsStub() + + def _run(test, num_samples=None, num_iters=None, + verbose=None, measure_memory=False): + return self.record_and_respond(test, num_samples, num_iters, + verbose, measure_memory) + self.run = _run + + def record_and_respond(self, test, num_samples, num_iters, + verbose, measure_memory): + args = (test, num_samples, num_iters, verbose, measure_memory) + self.calls.append(args) + return self.respond.get(args, '') + + +class TestLoggingReportFormatter(unittest.TestCase): + def test_plain_log_format(self): + lr = logging.makeLogRecord({ + 'name': 'Base.category', 'level': logging.DEBUG, + 'levelname': 'DEBUG', 'msg': 'Hi!'}) + f = LoggingReportFormatter() + self.assertEquals(f.format(lr), 'DEBUG category: Hi!') + + def test_colored_log_format(self): + def record(level, level_name): + return logging.makeLogRecord({ + 'name': 'Base.category', 'levelno': level, + 'levelname': level_name, 'msg': 'Hi!'}) + f = LoggingReportFormatter(use_color=True) + self.assertEquals(f.format(record(logging.DEBUG, 'DEBUG')), + '\x1b[1;39mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.INFO, 'INFO')), + '\x1b[1;32mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.WARNING, 'WARNING')), + '\x1b[1;33mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.ERROR, 'ERROR')), + '\x1b[1;31mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.CRITICAL, 'CRITICAL')), + '\x1b[1;35mcategory: Hi!\x1b[1;0m') + + def test_no_prefix_for_base_logging(self): + lr = logging.makeLogRecord({ + 'name': 'Base', 'level': logging.INFO, + 'levelname': 'INFO', 'msg': 'Hi!'}) + f = LoggingReportFormatter() + self.assertEquals(f.format(lr), 'INFO Hi!') + + +class TestBenchmarkDoctor(unittest.TestCase): + @classmethod + def setUpClass(cls): + super(TestBenchmarkDoctor, cls).setUpClass() + doctor_log = logging.getLogger('BenchmarkDoctor') + cls._doctor_log_handler = MockLoggingHandler(level='DEBUG') + doctor_log.addHandler(cls._doctor_log_handler) + + def setUp(self): + super(TestBenchmarkDoctor, self).setUp() + self.args = Stub(verbose=False) + self._doctor_log_handler.reset() + self.logs = self._doctor_log_handler.messages + + def assert_contains(self, texts, output): + assert not isinstance(texts, str) + for text in texts: + self.assertIn(text, output) + + def test_uses_logging(self): + driver = BenchmarkDriverMock(tests=['B1', 'B2']) + with captured_output() as (out, _): + BenchmarkDoctor(self.args, driver) + self.assert_contains(['Checking tests: B1, B2'], self.logs['debug']) + self.assertEquals(out.getvalue(), '') + + def test_supports_verbose_output(self): + driver = BenchmarkDriverMock(tests=['B1', 'B2']) + driver.verbose = True + with captured_output() as (out, _): + BenchmarkDoctor(Stub(verbose=True), driver) + self.assert_contains(['Checking 
tests: B1, B2'], out.getvalue()) + + def test_uses_report_formatter(self): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock(tests=['B1'])) + console_handler = logging.getLogger('BenchmarkDoctor').handlers[1] + self.assertTrue(doctor) + self.assertTrue(isinstance(console_handler, logging.StreamHandler)) + self.assertTrue(isinstance(console_handler.formatter, + LoggingReportFormatter)) + + def test_benchmark_name_matches_capital_words_conventions(self): + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', 'CapitalWordsConvention', 'ABBRName', + 'wrongCase', 'Wrong_convention']) + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, driver) + doctor.check() + output = out.getvalue() + + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) + self.assertNotIn('CapitalWordsConvention', output) + self.assertNotIn('ABBRName', output) + self.assert_contains( + ["'wrongCase' name doesn't conform to UpperCamelCase convention.", + "'Wrong_convention' name doesn't conform to UpperCamelCase " + "convention."], self.logs['error']) + self.assert_contains( + ['See http://bit.ly/UpperCamelCase'], self.logs['info']) + + def test_benchmark_name_is_at_most_40_chars_long(self): + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', + 'ThisTestNameIsTooLongAndCausesOverflowsInReports']) + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, driver) + doctor.check() + output = out.getvalue() + + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) + self.assert_contains( + ["'ThisTestNameIsTooLongAndCausesOverflowsInReports' name is " + "longer than 40 characters."], output) + + if __name__ == '__main__': unittest.main() diff --git a/benchmark/scripts/test_utils.py b/benchmark/scripts/test_utils.py index 7112ca37ba3f1..c83d130b73d03 100644 --- a/benchmark/scripts/test_utils.py +++ b/benchmark/scripts/test_utils.py @@ -22,6 +22,7 @@ common unit testing patterns that is used in this project. """ +import logging import sys from StringIO import StringIO @@ -86,3 +87,22 @@ def assert_called_all_expected(self): assert self.calls == self.expected, ( '\nExpected: {0}, \n Called: {1}'.format( self.expected, self.calls)) + + +class MockLoggingHandler(logging.Handler): + """Mock logging handler to check for expected logs.""" + + def __init__(self, *args, **kwargs): + """Prepare the logger for recording messages for each log level.""" + self.reset() + super(MockLoggingHandler, self).__init__(*args, **kwargs) + + def emit(self, record): + """Store the message in list for the given log level.""" + self.messages[record.levelname.lower()].append(record.getMessage()) + + def reset(self): + """Clear all log messages.""" + self.messages = { + 'debug': [], 'info': [], 'warning': [], 'error': [], 'critical': [] + } From 7725c0096e6c2031f09c7f4dd4de5b9f1f90558c Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 16 Aug 2018 20:40:28 +0200 Subject: [PATCH 18/20] [benchmark] Measure and analyze benchmark runtimes `BenchmarkDoctor` measures benchmark execution (using `BenchmarkDriver`) and verifies that their runtime stays under 2500 microseconds. 
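The 2500 μs limit and the roughly one-second sampling budget both reduce to power-of-two roundings. Below is a minimal standalone sketch of that arithmetic; the helper names `samples_per_second` and `workload_reduction_factor` are illustrative and not part of the patch, while the expressions mirror `_adjusted_1s_samples` and `_optimized_runtime_in_range` in the diff that follows (the driver additionally caps the sample count at 2048 when measuring).

````
import math


def samples_per_second(runtime_us):
    """Samples that fit into roughly 1 second, rounded to a power of two."""
    if runtime_us == 0:
        return 2
    s = 1000000 / float(runtime_us)          # raw samples per second
    s = int(pow(2, round(math.log(s, 2))))   # round to the nearest power of 2
    return s if s > 2 else 2                 # always take at least 2 samples


def workload_reduction_factor(runtime_us, limit_us=2500):
    """Power-of-two factor that brings the runtime under the limit.

    Only meaningful when runtime_us exceeds limit_us.
    """
    return int(pow(2, math.ceil(math.log(runtime_us / float(limit_us), 2))))


print(samples_per_second(300))            # 4096, as in the test expectations
print(workload_reduction_factor(500000))  # 256 (500000 / 256 ~ 1953 us < 2500 us)
````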
--- benchmark/scripts/Benchmark_Driver | 108 ++++++++++++++++++--- benchmark/scripts/test_Benchmark_Driver.py | 84 +++++++++++++++- 2 files changed, 178 insertions(+), 14 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index d1097ff903820..e8aa1ae9c04e8 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -16,6 +16,7 @@ import argparse import glob import logging +import math import os import re import subprocess @@ -185,6 +186,11 @@ class BenchmarkDoctor(object): LoggingReportFormatter(use_color=sys.stdout.isatty())) self.log.addHandler(self.console_handler) self.log.debug('Checking tests: %s', ', '.join(self.driver.tests)) + self.requirements = [ + self._name_matches_capital_words_convention, + self._name_is_at_most_40_chars_long, + self._optimized_runtime_in_range, + ] def __del__(self): """Unregister handler on exit.""" @@ -193,10 +199,11 @@ class BenchmarkDoctor(object): capital_words_re = re.compile('[A-Z][a-zA-Z0-9]+') @staticmethod - def _name_matches_capital_words_convention(test): - name = test + def _name_matches_capital_words_convention(measurements): + name = measurements['name'] match = BenchmarkDoctor.capital_words_re.match(name) matched = match.group(0) if match else '' + if name != matched: BenchmarkDoctor.log_naming.error( "'%s' name doesn't conform to UpperCamelCase convention.", @@ -205,24 +212,101 @@ class BenchmarkDoctor(object): 'See http://bit.ly/UpperCamelCase') @staticmethod - def _name_is_at_most_40_chars_long(test): - if len(test) > 40: - BenchmarkDoctor.log_naming.warning( - "'%s' name is longer than 40 characters.", test) + def _name_is_at_most_40_chars_long(measurements): + name = measurements['name'] + + if len(name) > 40: + BenchmarkDoctor.log_naming.error( + "'%s' name is %d characters long.", name, len(name)) + BenchmarkDoctor.log_naming.info( + 'Benchmark name should not be longer than 40 characters.') + + @staticmethod + def _select(measurements, num_iters=None, opt_level='O'): + prefix = measurements['name'] + ' ' + opt_level + prefix += '' if num_iters is None else (' i' + str(num_iters)) + return [series for name, series in measurements.items() + if name.startswith(prefix)] + + @staticmethod + def _optimized_runtime_in_range(measurements): + name = measurements['name'] + runtime = min(BenchmarkDoctor._select(measurements), + key=lambda r: r.min).min + if 2500 < runtime < 500000: + BenchmarkDoctor.log_runtime.warning( + "'%s' execution takes at least %d μs.", name, runtime) + elif 500000 <= runtime: + BenchmarkDoctor.log_runtime.error( + "'%s' execution takes at least %d μs.", name, runtime) + if 2500 < runtime: + factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2)))) + BenchmarkDoctor.log_runtime.info( + "Decrease the workload of '%s' by a factor of %d, " + "to be less than 2500 μs.", name, factor) + + @staticmethod + def _adjusted_1s_samples(runtime): + u"""Return sample count that can be taken in approximately 1 second. + + Based on the runtime (μs) of one sample taken with num-iters=1. + """ + if runtime == 0: + return 2 + s = 1000000 / float(runtime) # samples for 1s run + s = int(pow(2, round(math.log(s, 2)))) # rounding to power of 2 + return s if s > 2 else 2 # always take at least 2 samples + + def measure(self, benchmark): + """Measure benchmark with varying iterations and optimization levels. + + Returns a dictionary with benchmark name and `PerformanceTestResult`s. 
+ """ + self.log.debug('Calibrating num-samples for {0}:'.format(benchmark)) + r = self.driver.run(benchmark, num_samples=3, num_iters=1) # calibrate + num_samples = self._adjusted_1s_samples(r.min) + + def capped(s): + return min(s, 2048) + run_args = [(capped(num_samples), 1), (capped(num_samples / 2), 2)] + opts = self.driver.args.optimization + opts = opts if isinstance(opts, list) else [opts] + self.log.debug( + 'Runtime {0} μs yields {1} adjusted samples per second.'.format( + r.min, num_samples)) + self.log.debug( + 'Measuring {0}, 5 x i1 ({1} samples), 5 x i2 ({2} samples)'.format( + benchmark, run_args[0][0], run_args[1][0])) + + measurements = dict( + [('{0} {1} i{2}{3}'.format(benchmark, o, i, suffix), + self.driver.run(benchmark, num_samples=s, num_iters=i, + measure_memory=True)) + for o in opts + for s, i in run_args + for suffix in list('abcde') + ] + ) + measurements['name'] = benchmark + return measurements + + def analyze(self, benchmark_measurements): + """Analyze whether benchmark fullfills all requirtements.""" + self.log.debug('Analyzing %s', benchmark_measurements['name']) + for rule in self.requirements: + rule(benchmark_measurements) def check(self): - checks = [ - self._name_matches_capital_words_convention, - self._name_is_at_most_40_chars_long, - ] + """Measure and analyse all enabled tests.""" for test in self.driver.tests: - for check in checks: - check(test) + self.analyze(self.measure(test)) @staticmethod def run_check(args): doctor = BenchmarkDoctor(args) doctor.check() + # TODO non-zero error code when errors are logged + # See https://stackoverflow.com/a/31142078/41307 return 0 diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 327576e94ce09..a3a024e796ebd 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -290,7 +290,7 @@ def record_and_respond(self, test, num_samples, num_iters, verbose, measure_memory): args = (test, num_samples, num_iters, verbose, measure_memory) self.calls.append(args) - return self.respond.get(args, '') + return self.respond.get(args, Stub(min=700)) class TestLoggingReportFormatter(unittest.TestCase): @@ -326,6 +326,13 @@ def test_no_prefix_for_base_logging(self): self.assertEquals(f.format(lr), 'INFO Hi!') +def _run(test, num_samples=None, num_iters=None, verbose=None, + measure_memory=False): + """Helper function that constructs tuple with arguments for run method.""" + return ( + test, num_samples, num_iters, verbose, measure_memory) + + class TestBenchmarkDoctor(unittest.TestCase): @classmethod def setUpClass(cls): @@ -367,6 +374,41 @@ def test_uses_report_formatter(self): self.assertTrue(isinstance(console_handler.formatter, LoggingReportFormatter)) + def test_measure_10_independent_1s_benchmark_series(self): + """Measurement strategy takes 5 i2 and 5 i1 series. + + Num-samples for Benchmark Driver are calibrated to be powers of two, + take measurements for approximately 1s + based on short initial runtime sampling. Capped at 2k samples. 
+ """ + driver = BenchmarkDriverMock(tests=['B1'], responses=([ + # calibration run, returns a stand-in for PerformanceTestResult + (_run('B1', num_samples=3, num_iters=1), Stub(min=300))] + + # 5x i1 series, with 300 μs runtime its possible to take 4098 + # samples/s, but it should be capped at 2k + ([(_run('B1', num_samples=2048, num_iters=1, + measure_memory=True), Stub(min=300))] * 5) + + # 5x i2 series + ([(_run('B1', num_samples=2048, num_iters=2, + measure_memory=True), Stub(min=300))] * 5) + )) + doctor = BenchmarkDoctor(self.args, driver) + with captured_output() as (out, _): + measurements = doctor.measure('B1') + + driver.assert_called_all_expected() + self.assert_contains( + ['name', + 'B1 O i1a', 'B1 O i1b', 'B1 O i1c', 'B1 O i1d', 'B1 O i1e', + 'B1 O i2a', 'B1 O i2b', 'B1 O i2c', 'B1 O i2d', 'B1 O i2e'], + measurements.keys()) + self.assertEquals(measurements['name'], 'B1') + self.assert_contains( + ['Calibrating num-samples for B1:', + 'Runtime 300 μs yields 4096 adjusted samples per second.', + 'Measuring B1, 5 x i1 (2048 samples), 5 x i2 (2048 samples)'], + self.logs['debug']) + def test_benchmark_name_matches_capital_words_conventions(self): driver = BenchmarkDriverMock(tests=[ 'BenchmarkName', 'CapitalWordsConvention', 'ABBRName', @@ -400,7 +442,45 @@ def test_benchmark_name_is_at_most_40_chars_long(self): self.assertNotIn('BenchmarkName', output) self.assert_contains( ["'ThisTestNameIsTooLongAndCausesOverflowsInReports' name is " - "longer than 40 characters."], output) + "48 characters long."], self.logs['error']) + self.assert_contains( + ["Benchmark name should not be longer than 40 characters."], + self.logs['info']) + + def test_benchmark_runtime_range(self): + """Optimized benchmark should run in less then 2500 μs. + + With runtimes less than 2500 μs there is better than 1:4 chance of + being interrupted in the middle of measurement due to elapsed 10 ms + quantum used by macos scheduler. + + Warn about longer runtime. Runtimes over half a second are an error. + """ + def measurements(name, runtime): + return {'name': name, + name + ' O i1a': Stub(min=runtime + 2), + name + ' O i2a': Stub(min=runtime)} + + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze(measurements('Cheetah', 200)) + doctor.analyze(measurements('Hare', 2501)) + doctor.analyze(measurements('Tortoise', 500000)) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('Cheetah', output) + self.assert_contains(["'Hare' execution takes at least 2501 μs."], + self.logs['warning']) + self.assert_contains( + ["Decrease the workload of 'Hare' by a factor of 2, " + "to be less than 2500 μs."], self.logs['info']) + self.assert_contains( + ["'Tortoise' execution takes at least 500000 μs."], + self.logs['error']) + self.assert_contains( + ["Decrease the workload of 'Tortoise' by a factor of 256, " + "to be less than 2500 μs."], self.logs['info']) if __name__ == '__main__': From 06061976da26d5b9bb91a1160207949837c383d5 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Fri, 17 Aug 2018 08:50:04 +0200 Subject: [PATCH 19/20] [benchmark] BenchmarkDoctor checks setup overhead Detect setup overhead in benchmark and report if it exceeds 5%. 
--- benchmark/scripts/Benchmark_Driver | 42 +++++++++++++++++----- benchmark/scripts/test_Benchmark_Driver.py | 36 +++++++++++++++++-- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index e8aa1ae9c04e8..87cf1461085bd 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -189,6 +189,7 @@ class BenchmarkDoctor(object): self.requirements = [ self._name_matches_capital_words_convention, self._name_is_at_most_40_chars_long, + self._no_setup_overhead, self._optimized_runtime_in_range, ] @@ -231,20 +232,45 @@ class BenchmarkDoctor(object): @staticmethod def _optimized_runtime_in_range(measurements): name = measurements['name'] - runtime = min(BenchmarkDoctor._select(measurements), - key=lambda r: r.min).min - if 2500 < runtime < 500000: - BenchmarkDoctor.log_runtime.warning( - "'%s' execution takes at least %d μs.", name, runtime) - elif 500000 <= runtime: - BenchmarkDoctor.log_runtime.error( - "'%s' execution takes at least %d μs.", name, runtime) + setup, ratio = BenchmarkDoctor._setup_overhead(measurements) + setup = 0 if ratio < 0.05 else setup + runtime = min( + [(result.min - correction) for i_series in + [BenchmarkDoctor._select(measurements, num_iters=i) + for correction in [(setup / i) for i in [1, 2]] + ] for result in i_series]) + if 2500 < runtime: + log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else + BenchmarkDoctor.log_runtime.error) + caveat = '' if setup == 0 else ' (excluding the setup overhead)' + log("'%s' execution took at least %d μs%s.", name, runtime, caveat) factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2)))) BenchmarkDoctor.log_runtime.info( "Decrease the workload of '%s' by a factor of %d, " "to be less than 2500 μs.", name, factor) + @staticmethod + def _setup_overhead(measurements): + select = BenchmarkDoctor._select + ti1, ti2 = [float(min(mins)) for mins in + [[result.min for result in i_series] for i_series in + [select(measurements, num_iters=i) for i in [1, 2]]]] + setup = int(round(2.0 * (ti1 - ti2))) + ratio = (setup / ti1) if ti1 > 0 else 0 + return (setup, ratio) + + @staticmethod + def _no_setup_overhead(measurements): + setup, ratio = BenchmarkDoctor._setup_overhead(measurements) + if ratio > 0.05: + BenchmarkDoctor.log_runtime.error( + "'%s' has setup overhead of %d μs (%.1f%%).", + measurements['name'], setup, round((100 * ratio), 1)) + BenchmarkDoctor.log_runtime.info( + 'Move initialization of benchmark data to the `setUpFunction` ' + 'registered in `BenchmarkInfo`.') + @staticmethod def _adjusted_1s_samples(runtime): u"""Return sample count that can be taken in approximately 1 second. 
diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index a3a024e796ebd..1fd29dfe63671 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -466,21 +466,53 @@ def measurements(name, runtime): doctor.analyze(measurements('Cheetah', 200)) doctor.analyze(measurements('Hare', 2501)) doctor.analyze(measurements('Tortoise', 500000)) + doctor.analyze({'name': 'OverheadTurtle', + 'OverheadTurtle O i1a': Stub(min=800000), + 'OverheadTurtle O i2a': Stub(min=700000)}) output = out.getvalue() self.assertIn('runtime: ', output) self.assertNotIn('Cheetah', output) - self.assert_contains(["'Hare' execution takes at least 2501 μs."], + self.assert_contains(["'Hare' execution took at least 2501 μs."], self.logs['warning']) self.assert_contains( ["Decrease the workload of 'Hare' by a factor of 2, " "to be less than 2500 μs."], self.logs['info']) self.assert_contains( - ["'Tortoise' execution takes at least 500000 μs."], + ["'Tortoise' execution took at least 500000 μs."], self.logs['error']) self.assert_contains( ["Decrease the workload of 'Tortoise' by a factor of 256, " "to be less than 2500 μs."], self.logs['info']) + self.assert_contains( + ["'OverheadTurtle' execution took at least 600000 μs" + " (excluding the setup overhead)."], + self.logs['error']) + + def test_benchmark_has_no_significant_setup_overhead(self): + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + 'name': 'NoOverhead', # not 'significant' enough + # Based on DropFirstArray a10/e10: overhead 3.7% (6 μs) + 'NoOverhead O i1a': Stub(min=162), + 'NoOverhead O i2a': Stub(min=159)}) + doctor.analyze({ + 'name': 'SO', # Setup Overhead + # Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs) + 'SO O i1a': Stub(min=69), 'SO O i1b': Stub(min=70), + 'SO O i2a': Stub(min=67), 'SO O i2b': Stub(min=68)}) + # TODO tests with TypeFlood (0 runtime) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('NoOverhead', output) + self.assert_contains( + ["'SO' has setup overhead of 4 μs (5.8%)."], + self.logs['error']) + self.assert_contains( + ["Move initialization of benchmark data to the `setUpFunction` " + "registered in `BenchmarkInfo`."], self.logs['info']) if __name__ == '__main__': From f38e6df914db58251002bf272cb03f3566d9906a Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Fri, 17 Aug 2018 08:51:46 +0200 Subject: [PATCH 20/20] [benchmark] Doctor verifies constant memory use This needs to be finished with function approximating normal range based on the memory used. 
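The memory check compares the page counts of the i1 and i2 series: a jump of the series minima larger than the observed ranges (or the 15-page tolerance) means the footprint depends on num-iters, while a wide range within a series means independent, repeated measurements disagree. A condensed sketch of those two comparisons follows; the function name and string verdicts are illustrative, and the driver in the diff below logs an error and a warning instead, possibly both for the same benchmark.

````
def memory_use_verdict(i1_pages, i2_pages, normal_range=15):
    """Classify memory behaviour from per-series page counts."""
    min_i1, max_i1 = min(i1_pages), max(i1_pages)
    min_i2, max_i2 = min(i2_pages), max(i2_pages)
    range_i1, range_i2 = max_i1 - min_i1, max_i2 - min_i2
    if abs(min_i1 - min_i2) > max(range_i1, range_i2, normal_range):
        return 'error: footprint varies with num-iters'
    if max(range_i1, range_i2) > normal_range:
        return 'warning: wide range between independent measurements'
    return 'ok'


# Page counts from the test cases added in this patch:
print(memory_use_verdict([1460], [1475]))              # ok (15-page delta tolerated)
print(memory_use_verdict([1460, 1472], [1750, 1752]))  # error: footprint varies ...
````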
--- benchmark/scripts/Benchmark_Driver | 29 ++++++++- benchmark/scripts/test_Benchmark_Driver.py | 74 ++++++++++++++++++---- 2 files changed, 89 insertions(+), 14 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 87cf1461085bd..eeac28fc226b5 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -191,6 +191,7 @@ class BenchmarkDoctor(object): self._name_is_at_most_40_chars_long, self._no_setup_overhead, self._optimized_runtime_in_range, + self._constant_memory_use ] def __del__(self): @@ -271,6 +272,32 @@ class BenchmarkDoctor(object): 'Move initialization of benchmark data to the `setUpFunction` ' 'registered in `BenchmarkInfo`.') + @staticmethod + def _constant_memory_use(measurements): + select = BenchmarkDoctor._select + (min_i1, max_i1), (min_i2, max_i2) = [ + (min(memory_use), max(memory_use)) for memory_use in + [[r.mem_pages for r in i_series] for i_series in + [select(measurements, num_iters=i) for i in + [1, 2]]]] + range_i1, range_i2 = max_i1 - min_i1, max_i2 - min_i2 + normal_range = 15 # pages + name = measurements['name'] + + if abs(min_i1 - min_i2) > max(range_i1, range_i2, normal_range): + BenchmarkDoctor.log_memory.error( + "'%s' varies the memory footprint of the base " + "workload depending on the `num-iters`.", name) + + if max(range_i1, range_i2) > normal_range: + BenchmarkDoctor.log_memory.warning( + "'%s' has very wide range of memory used between " + "independent, repeated measurements.", name) + + BenchmarkDoctor.log_memory.debug( + "%s mem_pages [i1, i2]: min=[%d, %d] 𝚫=%d R=[%d, %d]", name, + *[min_i1, min_i2, abs(min_i1 - min_i2), range_i1, range_i2]) + @staticmethod def _adjusted_1s_samples(runtime): u"""Return sample count that can be taken in approximately 1 second. 
@@ -307,7 +334,7 @@ class BenchmarkDoctor(object): measurements = dict( [('{0} {1} i{2}{3}'.format(benchmark, o, i, suffix), self.driver.run(benchmark, num_samples=s, num_iters=i, - measure_memory=True)) + verbose=True)) for o in opts for s, i in run_args for suffix in list('abcde') diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 1fd29dfe63671..7bebca0847407 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -290,7 +290,7 @@ def record_and_respond(self, test, num_samples, num_iters, verbose, measure_memory): args = (test, num_samples, num_iters, verbose, measure_memory) self.calls.append(args) - return self.respond.get(args, Stub(min=700)) + return self.respond.get(args, _PTR(min=700)) class TestLoggingReportFormatter(unittest.TestCase): @@ -326,6 +326,11 @@ def test_no_prefix_for_base_logging(self): self.assertEquals(f.format(lr), 'INFO Hi!') +def _PTR(min=700, mem_pages=1000): + """Create PerformanceTestResult Stub.""" + return Stub(min=min, mem_pages=mem_pages) + + def _run(test, num_samples=None, num_iters=None, verbose=None, measure_memory=False): """Helper function that constructs tuple with arguments for run method.""" @@ -383,14 +388,14 @@ def test_measure_10_independent_1s_benchmark_series(self): """ driver = BenchmarkDriverMock(tests=['B1'], responses=([ # calibration run, returns a stand-in for PerformanceTestResult - (_run('B1', num_samples=3, num_iters=1), Stub(min=300))] + + (_run('B1', num_samples=3, num_iters=1), _PTR(min=300))] + # 5x i1 series, with 300 μs runtime its possible to take 4098 # samples/s, but it should be capped at 2k ([(_run('B1', num_samples=2048, num_iters=1, - measure_memory=True), Stub(min=300))] * 5) + + verbose=True), _PTR(min=300))] * 5) + # 5x i2 series ([(_run('B1', num_samples=2048, num_iters=2, - measure_memory=True), Stub(min=300))] * 5) + verbose=True), _PTR(min=300))] * 5) )) doctor = BenchmarkDoctor(self.args, driver) with captured_output() as (out, _): @@ -458,8 +463,8 @@ def test_benchmark_runtime_range(self): """ def measurements(name, runtime): return {'name': name, - name + ' O i1a': Stub(min=runtime + 2), - name + ' O i2a': Stub(min=runtime)} + name + ' O i1a': _PTR(min=runtime + 2), + name + ' O i2a': _PTR(min=runtime)} with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) @@ -467,8 +472,8 @@ def measurements(name, runtime): doctor.analyze(measurements('Hare', 2501)) doctor.analyze(measurements('Tortoise', 500000)) doctor.analyze({'name': 'OverheadTurtle', - 'OverheadTurtle O i1a': Stub(min=800000), - 'OverheadTurtle O i2a': Stub(min=700000)}) + 'OverheadTurtle O i1a': _PTR(min=800000), + 'OverheadTurtle O i2a': _PTR(min=700000)}) output = out.getvalue() self.assertIn('runtime: ', output) @@ -495,18 +500,20 @@ def test_benchmark_has_no_significant_setup_overhead(self): doctor.analyze({ 'name': 'NoOverhead', # not 'significant' enough # Based on DropFirstArray a10/e10: overhead 3.7% (6 μs) - 'NoOverhead O i1a': Stub(min=162), - 'NoOverhead O i2a': Stub(min=159)}) + 'NoOverhead O i1a': _PTR(min=162), + 'NoOverhead O i2a': _PTR(min=159)}) doctor.analyze({ 'name': 'SO', # Setup Overhead # Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs) - 'SO O i1a': Stub(min=69), 'SO O i1b': Stub(min=70), - 'SO O i2a': Stub(min=67), 'SO O i2b': Stub(min=68)}) - # TODO tests with TypeFlood (0 runtime) + 'SO O i1a': _PTR(min=69), 'SO O i1b': _PTR(min=70), + 'SO O i2a': _PTR(min=67), 'SO O i2b': 
_PTR(min=68)}) + doctor.analyze({'name': 'Zero', 'Zero O i1a': _PTR(min=0), + 'Zero O i2a': _PTR(min=0)}) output = out.getvalue() self.assertIn('runtime: ', output) self.assertNotIn('NoOverhead', output) + self.assertNotIn('ZeroRuntime', output) self.assert_contains( ["'SO' has setup overhead of 4 μs (5.8%)."], self.logs['error']) @@ -514,6 +521,47 @@ def test_benchmark_has_no_significant_setup_overhead(self): ["Move initialization of benchmark data to the `setUpFunction` " "registered in `BenchmarkInfo`."], self.logs['info']) + def test_benchmark_has_constant_memory_use(self): + """Benchmark's memory footprint must not vary with num-iters.""" + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + # The threshold of 15 pages was estimated from previous + # measurements. The normal range should be probably aproximated + # by a function instead of a simple constant. + # TODO: re-evaluate normal range from whole SBS + 'name': 'ConstantMemory', + 'ConstantMemory O i1a': _PTR(mem_pages=1460), + 'ConstantMemory O i2a': _PTR(mem_pages=(1460 + 15))}) + doctor.analyze({ + 'name': 'VariableMemory', # ObserverForwardStruct + 'VariableMemory O i1a': _PTR(mem_pages=1460), + 'VariableMemory O i1b': _PTR(mem_pages=1472), + # i2 series start at 290 pages higher + 'VariableMemory O i2a': _PTR(mem_pages=1750), + 'VariableMemory O i2b': _PTR(mem_pages=1752)}) + measurements = dict([ + ('HighVariance O i{0}{1}'.format(num_iters, suffix), + _PTR(mem_pages=num_pages)) + for num_iters, pages in [ + (1, [6200, 5943, 4818, 5612, 5469]), + (2, [6244, 5832, 4674, 5176, 5490])] + for num_pages, suffix in zip(pages, list('abcde'))]) + measurements['name'] = 'HighVariance' # Array2D + doctor.analyze(measurements) + output = out.getvalue() + + self.assertIn('memory: ', output) + self.assertNotIn('ConstantMemory', output) + self.assert_contains( + ["'VariableMemory' varies the memory footprint of the base " + "workload depending on the `num-iters`."], + self.logs['error']) + self.assert_contains( + ["'HighVariance' has very wide range of memory used between " + "independent, repeated measurements."], + self.logs['warning']) + if __name__ == '__main__': unittest.main()
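The flat 15-page tolerance clearly cannot cover a benchmark like the Array2D-based `HighVariance` case above, whose independent runs span well over a thousand pages at a footprint of roughly five to six thousand pages; that is the gap the last commit message leaves as future work. Purely as a hypothetical illustration of what a footprint-dependent tolerance might look like, and not something this patch series proposes or derives from Swift Benchmark Suite data:

````
def normal_range(base_pages):
    """Hypothetical tolerance (pages) that grows with the measured footprint.

    The patch uses a flat 15-page tolerance; the scaling rule here is an
    assumption chosen only to show the shape such a function could take.
    """
    return max(15, base_pages // 100)


print(normal_range(1460))  # 15 pages, same as the current constant
print(normal_range(6000))  # 60 pages for a larger footprint
````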