diff --git a/.gitignore b/.gitignore index 74aa31bc37df7..05e03732a5215 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,4 @@ compile_commands.json SortedCFDatabase.def htmlcov .coverage +/benchmark/scripts/Benchmark_Driverc diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 92f12c627855a..eeac28fc226b5 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -14,136 +14,353 @@ # ===---------------------------------------------------------------------===// import argparse -import datetime import glob -import json +import logging +import math import os import re import subprocess import sys import time -import urllib -import urllib2 + +from compare_perf_tests import LogParser DRIVER_DIR = os.path.dirname(os.path.realpath(__file__)) -def parse_results(res, optset): - # Parse lines like this - # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),PEAK_MEMORY(B) - score_re = re.compile(r"(\d+),[ \t]*(\w+)," + - ",".join([r"[ \t]*([\d.]+)"] * 7)) - # The Totals line would be parsed like this. - total_re = re.compile(r"()(Totals)," + - ",".join([r"[ \t]*([\d.]+)"] * 7)) - key_group = 2 - val_group = 4 - mem_group = 9 - - tests = [] - for line in res.split(): - m = score_re.match(line) - if not m: - m = total_re.match(line) - if not m: - continue - testresult = int(m.group(val_group)) - testname = m.group(key_group) - test = {} - test['Data'] = [testresult] - test['Info'] = {} - test['Name'] = "nts.swift/" + optset + "." + testname + ".exec" - tests.append(test) - if testname != 'Totals': - mem_testresult = int(m.group(mem_group)) - mem_test = {} - mem_test['Data'] = [mem_testresult] - mem_test['Info'] = {} - mem_test['Name'] = "nts.swift/mem_maxrss." + \ - optset + "." + testname + ".mem" - tests.append(mem_test) - return tests - - -def submit_to_lnt(data, url): - print("\nSubmitting results to LNT server...") - json_report = {'input_data': json.dumps(data), 'commit': '1'} - data = urllib.urlencode(json_report) - response_str = urllib2.urlopen(urllib2.Request(url, data)) - response = json.loads(response_str.read()) - if 'success' in response: - print("Server response:\tSuccess") - else: - print("Server response:\tError") - print("Error:\t", response['error']) - sys.exit(1) - - -def instrument_test(driver_path, test, num_samples): - """Run a test and instrument its peak memory use""" - test_outputs = [] - for _ in range(num_samples): - test_output_raw = subprocess.check_output( - ['time', '-lp', driver_path, test], - stderr=subprocess.STDOUT - ) - peak_memory = re.match('\s*(\d+)\s*maximum resident set size', - test_output_raw.split('\n')[-15]).group(1) - test_outputs.append(test_output_raw.split()[1].split(',') + - [peak_memory]) - - # Average sample results - num_samples_index = 2 - min_index = 3 - max_index = 4 - avg_start_index = 5 - - # TODO: Correctly take stdev - avg_test_output = test_outputs[0] - avg_test_output[avg_start_index:] = map(int, - avg_test_output[avg_start_index:]) - for test_output in test_outputs[1:]: - for i in range(avg_start_index, len(test_output)): - avg_test_output[i] += int(test_output[i]) - for i in range(avg_start_index, len(avg_test_output)): - avg_test_output[i] = int(round(avg_test_output[i] / - float(len(test_outputs)))) - avg_test_output[num_samples_index] = num_samples - avg_test_output[min_index] = min( - test_outputs, key=lambda x: int(x[min_index]))[min_index] - avg_test_output[max_index] = max( - test_outputs, key=lambda x: int(x[max_index]))[max_index] - avg_test_output 
= map(str, avg_test_output) - - return avg_test_output - - -def get_tests(driver_path, args): - """Return a list of available performance tests""" - driver = ([driver_path, '--list']) - # Use tab delimiter for easier parsing to override the default comma. - # (The third 'column' is always comma-separated list of tags in square - # brackets -- currently unused here.) - driver.append('--delim=\t') - if args.benchmarks or args.filters: - driver.append('--skip-tags=') # list all tests, don't skip any tags - index_name_pairs = [ - line.split('\t')[:2] for line in - subprocess.check_output(driver).split('\n')[1:-1] - ] - indices, names = zip(*index_name_pairs) # unzip list of pairs into 2 lists - if args.filters: - regexes = [re.compile(pattern) for pattern in args.filters] +class BenchmarkDriver(object): + """Executes tests from Swift Benchmark Suite.""" + + def __init__(self, args, tests=None, _subprocess=None, parser=None): + """Initialized with command line arguments. + + Optional parameters for injecting dependencies; used for testing. + """ + self.args = args + self._subprocess = _subprocess or subprocess + self.all_tests = [] + self.tests = tests or self._get_tests() + self.parser = parser or LogParser() + self.results = {} + + def _invoke(self, cmd): + return self._subprocess.check_output( + cmd, stderr=self._subprocess.STDOUT) + + @property + def test_harness(self): + """Full path to test harness binary.""" + suffix = (self.args.optimization if hasattr(self.args, 'optimization') + else 'O') + return os.path.join(self.args.tests, "Benchmark_" + suffix) + + @property + def _cmd_list_benchmarks(self): + # Use tab delimiter for easier parsing to override the default comma. + # (The third 'column' is always comma-separated list of tags in square + # brackets -- currently unused here.) 
+ return [self.test_harness, '--list', '--delim=\t'] + ( + ['--skip-tags='] if (self.args.benchmarks or + self.args.filters) else []) + + def _get_tests(self): + """Return a list of performance tests to run.""" + index_name_pairs = [ + line.split('\t')[:2] for line in + self._invoke(self._cmd_list_benchmarks).split('\n')[1:-1] + ] + # unzip list of pairs into 2 lists + indices, self.all_tests = map(list, zip(*index_name_pairs)) + if self.args.filters: + return self._tests_matching_patterns() + if self.args.benchmarks: + return self._tests_by_name_or_index(indices) + return self.all_tests + + def _tests_matching_patterns(self): + regexes = [re.compile(pattern) for pattern in self.args.filters] return sorted(list(set([name for pattern in regexes - for name in names if pattern.match(name)]))) - if not args.benchmarks: - return names - benchmarks = set(args.benchmarks) - index_to_name = dict(index_name_pairs) - indexed_names = [index_to_name[i] - for i in benchmarks.intersection(set(indices))] - return sorted(list( - benchmarks.intersection(set(names)).union(indexed_names))) + for name in self.all_tests + if pattern.match(name)]))) + + def _tests_by_name_or_index(self, indices): + benchmarks = set(self.args.benchmarks) + index_to_name = dict(zip(indices, self.all_tests)) + indexed_names = [index_to_name[i] + for i in benchmarks.intersection(set(indices))] + return sorted(list( + benchmarks.intersection(set(self.all_tests)).union(indexed_names))) + + def run(self, test, num_samples=None, num_iters=None, + verbose=None, measure_memory=False): + """Execute benchmark and gather results.""" + num_samples = num_samples or 1 + num_iters = num_iters or 0 # automatically determine N to run for 1s + + cmd = self._cmd_run( + test, num_samples, num_iters, verbose, measure_memory) + output = self._invoke(cmd) + result = self.parser.results_from_string(output).items()[0][1] + return result + + def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory): + cmd = [self.test_harness, test] + if num_samples > 1: + cmd.append('--num-samples={0}'.format(num_samples)) + if num_iters > 0: + cmd.append('--num-iters={0}'.format(num_iters)) + if verbose: + cmd.append('--verbose') + if measure_memory: + cmd.append('--memory') + return cmd + + def run_independent_samples(self, test): + """Run benchmark multiple times, gathering independent samples. + + Returns the aggregated result of independent benchmark invocations. + """ + def merge_results(a, b): + a.merge(b) + return a + + return reduce(merge_results, + [self.run(test, measure_memory=True) + for _ in range(self.args.iterations)]) + + +class LoggingReportFormatter(logging.Formatter): + """Format logs as plain text or with colors on the terminal. + + Plain text outputs level, category and massage: 'DEBUG category: Hi!' + Colored output uses color coding based on the level. + """ + + import logging as log + colors = {log.DEBUG: '9', log.INFO: '2', log.WARNING: '3', log.ERROR: '1', + log.CRITICAL: '5'} + + def __init__(self, use_color=False): + """Specify if report should use colors; defaults to False.""" + super(LoggingReportFormatter, self).__init__('%(message)s') + self.use_color = use_color + + def format(self, record): + """Format the log record with level and category.""" + msg = super(LoggingReportFormatter, self).format(record) + category = ((record.name.split('.')[-1] + ': ') if '.' 
in record.name + else '') + return ('\033[1;3{0}m{1}{2}\033[1;0m'.format( + self.colors[record.levelno], category, msg) if self.use_color else + '{0} {1}{2}'.format(record.levelname, category, msg)) + + +class BenchmarkDoctor(object): + """Checks that the benchmark conforms to the standard set of requirements. + + Benchmarks that are part of Swift Benchmark Suite are required to follow + a set of rules that ensure quality measurements. These include naming + convention, robustness when varying execution parameters like + `num-iters` and `num-samples` (no setup overhead, constant memory + consumption). + """ + + log = logging.getLogger('BenchmarkDoctor') + log_naming = log.getChild('naming') + log_runtime = log.getChild('runtime') + log_memory = log.getChild('memory') + log.setLevel(logging.DEBUG) + + def __init__(self, args, driver=None): + """Initialize with command line parameters. + + Optional `driver` parameter for injecting dependency; used for testing. + """ + super(BenchmarkDoctor, self).__init__() + self.driver = driver or BenchmarkDriver(args) + self.results = {} + self.console_handler = logging.StreamHandler(sys.stdout) + self.console_handler.setLevel(logging.DEBUG if args.verbose else + logging.INFO) + self.console_handler.setFormatter( + LoggingReportFormatter(use_color=sys.stdout.isatty())) + self.log.addHandler(self.console_handler) + self.log.debug('Checking tests: %s', ', '.join(self.driver.tests)) + self.requirements = [ + self._name_matches_capital_words_convention, + self._name_is_at_most_40_chars_long, + self._no_setup_overhead, + self._optimized_runtime_in_range, + self._constant_memory_use + ] + + def __del__(self): + """Unregister handler on exit.""" + self.log.removeHandler(self.console_handler) + + capital_words_re = re.compile('[A-Z][a-zA-Z0-9]+') + + @staticmethod + def _name_matches_capital_words_convention(measurements): + name = measurements['name'] + match = BenchmarkDoctor.capital_words_re.match(name) + matched = match.group(0) if match else '' + + if name != matched: + BenchmarkDoctor.log_naming.error( + "'%s' name doesn't conform to UpperCamelCase convention.", + name) + BenchmarkDoctor.log_naming.info( + 'See http://bit.ly/UpperCamelCase') + + @staticmethod + def _name_is_at_most_40_chars_long(measurements): + name = measurements['name'] + + if len(name) > 40: + BenchmarkDoctor.log_naming.error( + "'%s' name is %d characters long.", name, len(name)) + BenchmarkDoctor.log_naming.info( + 'Benchmark name should not be longer than 40 characters.') + + @staticmethod + def _select(measurements, num_iters=None, opt_level='O'): + prefix = measurements['name'] + ' ' + opt_level + prefix += '' if num_iters is None else (' i' + str(num_iters)) + return [series for name, series in measurements.items() + if name.startswith(prefix)] + + @staticmethod + def _optimized_runtime_in_range(measurements): + name = measurements['name'] + setup, ratio = BenchmarkDoctor._setup_overhead(measurements) + setup = 0 if ratio < 0.05 else setup + runtime = min( + [(result.min - correction) for i_series in + [BenchmarkDoctor._select(measurements, num_iters=i) + for correction in [(setup / i) for i in [1, 2]] + ] for result in i_series]) + + if 2500 < runtime: + log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else + BenchmarkDoctor.log_runtime.error) + caveat = '' if setup == 0 else ' (excluding the setup overhead)' + log("'%s' execution took at least %d μs%s.", name, runtime, caveat) + factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2)))) + 
BenchmarkDoctor.log_runtime.info( + "Decrease the workload of '%s' by a factor of %d, " + "to be less than 2500 μs.", name, factor) + + @staticmethod + def _setup_overhead(measurements): + select = BenchmarkDoctor._select + ti1, ti2 = [float(min(mins)) for mins in + [[result.min for result in i_series] for i_series in + [select(measurements, num_iters=i) for i in [1, 2]]]] + setup = int(round(2.0 * (ti1 - ti2))) + ratio = (setup / ti1) if ti1 > 0 else 0 + return (setup, ratio) + + @staticmethod + def _no_setup_overhead(measurements): + setup, ratio = BenchmarkDoctor._setup_overhead(measurements) + if ratio > 0.05: + BenchmarkDoctor.log_runtime.error( + "'%s' has setup overhead of %d μs (%.1f%%).", + measurements['name'], setup, round((100 * ratio), 1)) + BenchmarkDoctor.log_runtime.info( + 'Move initialization of benchmark data to the `setUpFunction` ' + 'registered in `BenchmarkInfo`.') + + @staticmethod + def _constant_memory_use(measurements): + select = BenchmarkDoctor._select + (min_i1, max_i1), (min_i2, max_i2) = [ + (min(memory_use), max(memory_use)) for memory_use in + [[r.mem_pages for r in i_series] for i_series in + [select(measurements, num_iters=i) for i in + [1, 2]]]] + range_i1, range_i2 = max_i1 - min_i1, max_i2 - min_i2 + normal_range = 15 # pages + name = measurements['name'] + + if abs(min_i1 - min_i2) > max(range_i1, range_i2, normal_range): + BenchmarkDoctor.log_memory.error( + "'%s' varies the memory footprint of the base " + "workload depending on the `num-iters`.", name) + + if max(range_i1, range_i2) > normal_range: + BenchmarkDoctor.log_memory.warning( + "'%s' has very wide range of memory used between " + "independent, repeated measurements.", name) + + BenchmarkDoctor.log_memory.debug( + "%s mem_pages [i1, i2]: min=[%d, %d] 𝚫=%d R=[%d, %d]", name, + *[min_i1, min_i2, abs(min_i1 - min_i2), range_i1, range_i2]) + + @staticmethod + def _adjusted_1s_samples(runtime): + u"""Return sample count that can be taken in approximately 1 second. + + Based on the runtime (μs) of one sample taken with num-iters=1. + """ + if runtime == 0: + return 2 + s = 1000000 / float(runtime) # samples for 1s run + s = int(pow(2, round(math.log(s, 2)))) # rounding to power of 2 + return s if s > 2 else 2 # always take at least 2 samples + + def measure(self, benchmark): + """Measure benchmark with varying iterations and optimization levels. + + Returns a dictionary with benchmark name and `PerformanceTestResult`s. 
+        """
+        self.log.debug('Calibrating num-samples for {0}:'.format(benchmark))
+        r = self.driver.run(benchmark, num_samples=3, num_iters=1)  # calibrate
+        num_samples = self._adjusted_1s_samples(r.min)
+
+        def capped(s):
+            return min(s, 2048)
+        run_args = [(capped(num_samples), 1), (capped(num_samples / 2), 2)]
+        opts = self.driver.args.optimization
+        opts = opts if isinstance(opts, list) else [opts]
+        self.log.debug(
+            'Runtime {0} μs yields {1} adjusted samples per second.'.format(
+                r.min, num_samples))
+        self.log.debug(
+            'Measuring {0}, 5 x i1 ({1} samples), 5 x i2 ({2} samples)'.format(
+                benchmark, run_args[0][0], run_args[1][0]))
+
+        measurements = dict(
+            [('{0} {1} i{2}{3}'.format(benchmark, o, i, suffix),
+              self.driver.run(benchmark, num_samples=s, num_iters=i,
+                              verbose=True))
+             for o in opts
+             for s, i in run_args
+             for suffix in list('abcde')
+             ]
+        )
+        measurements['name'] = benchmark
+        return measurements
+
+    def analyze(self, benchmark_measurements):
+        """Analyze whether a benchmark fulfills all requirements."""
+        self.log.debug('Analyzing %s', benchmark_measurements['name'])
+        for rule in self.requirements:
+            rule(benchmark_measurements)
+
+    def check(self):
+        """Measure and analyze all enabled tests."""
+        for test in self.driver.tests:
+            self.analyze(self.measure(test))
+
+    @staticmethod
+    def run_check(args):
+        doctor = BenchmarkDoctor(args)
+        doctor.check()
+        # TODO: non-zero error code when errors are logged
+        # See https://stackoverflow.com/a/31142078/41307
+        return 0
 
 
 def get_current_git_branch(git_repo_path):
@@ -190,11 +407,10 @@ def log_results(log_directory, driver, formatted_output, swift_repo=None):
         f.write(formatted_output)
 
 
-def run_benchmarks(driver, benchmarks=[], num_samples=10, verbose=False,
+def run_benchmarks(driver,
                    log_directory=None, swift_repo=None):
     """Run perf tests individually and return results in a format that's
-    compatible with `parse_results`. If `benchmarks` is not empty,
-    only run tests included in it.
+    compatible with `LogParser`.
     """
     # Set a constant hash seed. Some tests are currently sensitive to
     # fluctuations in the number of hash collisions.
@@ -203,88 +419,43 @@
     # that runs the tests.
os.environ["SWIFT_DETERMINISTIC_HASHING"] = "1" - (total_tests, total_min, total_max, total_mean) = (0, 0, 0, 0) output = [] headings = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', 'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)'] line_format = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}' - if verbose and log_directory: + if log_directory: print(line_format.format(*headings)) - for test in benchmarks: - test_output = instrument_test(driver, test, num_samples) - if test_output[0] == 'Totals': - continue - if verbose: - if log_directory: - print(line_format.format(*test_output)) - else: - print(','.join(test_output)) + else: + print(','.join(headings)) + for test in driver.tests: + r = driver.run_independent_samples(test) + test_output = map(str, [ + r.test_num, r.name, r.num_samples, r.min, r.max, int(r.mean), + int(r.sd), r.median, r.max_rss]) + if log_directory: + print(line_format.format(*test_output)) + else: + print(','.join(test_output)) output.append(test_output) - (samples, _min, _max, mean) = map(int, test_output[2:6]) - total_tests += 1 - total_min += _min - total_max += _max - total_mean += mean if not output: return formatted_output = '\n'.join([','.join(l) for l in output]) - totals = map(str, ['Totals', total_tests, total_min, total_max, - total_mean, '0', '0', '0']) + totals = ['Totals', str(len(driver.tests))] totals_output = '\n\n' + ','.join(totals) - if verbose: - if log_directory: - print(line_format.format(*([''] + totals))) - else: - print(totals_output[1:]) + if log_directory: + print(line_format.format(*([''] + totals + ([''] * 6)))) + else: + print(totals_output[1:]) formatted_output += totals_output if log_directory: - log_results(log_directory, driver, formatted_output, swift_repo) + log_results(log_directory, driver.test_harness, formatted_output, + swift_repo) return formatted_output -def submit(args): - print("SVN revision:\t", args.revision) - print("Machine name:\t", args.machine) - print("Iterations:\t", args.iterations) - print("Optimizations:\t", ','.join(args.optimization)) - print("LNT host:\t", args.lnt_host) - starttime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - print("Start time:\t", starttime) - data = {} - data['Tests'] = [] - data['Machine'] = {'Info': {'name': args.machine}, 'Name': args.machine} - print("\nRunning benchmarks...") - for optset in args.optimization: - print("Opt level:\t", optset) - file = os.path.join(args.tests, "Benchmark_" + optset) - try: - res = run_benchmarks( - file, benchmarks=get_tests(file, args), - num_samples=args.iterations) - data['Tests'].extend(parse_results(res, optset)) - except subprocess.CalledProcessError as e: - print("Execution failed.. 
Test results are empty.") - print("Process output:\n", e.output) - - endtime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - data['Run'] = {'End Time': endtime, - 'Info': {'inferred_run_order': str(args.revision), - 'run_order': str(args.revision), - 'tag': 'nts', - 'test_suite_revision': 'None'}, - 'Start Time': starttime} - print("End time:\t", endtime) - - submit_to_lnt(data, args.lnt_host) - return 0 - - def run(args): - optset = args.optimization - file = os.path.join(args.tests, "Benchmark_" + optset) run_benchmarks( - file, benchmarks=get_tests(file, args), - num_samples=args.iterations, verbose=True, + BenchmarkDriver(args), log_directory=args.output_dir, swift_repo=args.swift_repo) return 0 @@ -392,16 +563,17 @@ def positive_int(value): return ivalue -def main(): +def parse_args(args): + """Parse command line arguments and set default values.""" parser = argparse.ArgumentParser( epilog='Example: ./Benchmark_Driver run -i 5 -f Prefix -f .*Suffix.*' ) subparsers = parser.add_subparsers( title='Swift benchmark driver commands', - help='See COMMAND -h for additional arguments', metavar='') + help='See COMMAND -h for additional arguments', metavar='COMMAND') - parent_parser = argparse.ArgumentParser(add_help=False) - benchmarks_group = parent_parser.add_mutually_exclusive_group() + shared_benchmarks_parser = argparse.ArgumentParser(add_help=False) + benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group() benchmarks_group.add_argument( 'benchmarks', default=[], @@ -410,45 +582,22 @@ def main(): '-f', '--filter', dest='filters', action='append', help='run all tests whose name match regular expression PATTERN, ' + 'multiple filters are supported', metavar="PATTERN") - parent_parser.add_argument( + shared_benchmarks_parser.add_argument( '-t', '--tests', help='directory containing Benchmark_O{,none,size} ' + '(default: DRIVER_DIR)', default=DRIVER_DIR) - - submit_parser = subparsers.add_parser( - 'submit', - help='Run benchmarks and submit results to LNT', - parents=[parent_parser]) - submit_parser.add_argument( - '-o', '--optimization', nargs='+', - help='optimization levels to use (default: O Onone Osize)', - default=['O', 'Onone', 'Osize']) - submit_parser.add_argument( - '-i', '--iterations', - help='number of times to run each test (default: 10)', - type=positive_int, default=10) - submit_parser.add_argument( - '-m', '--machine', required=True, - help='LNT machine name') - submit_parser.add_argument( - '-r', '--revision', required=True, - help='SVN revision of compiler to identify the LNT run', type=int) - submit_parser.add_argument( - '-l', '--lnt_host', required=True, - help='LNT host to submit results to') - submit_parser.set_defaults(func=submit) - - run_parser = subparsers.add_parser( - 'run', - help='Run benchmarks and output results to stdout', - parents=[parent_parser]) - run_parser.add_argument( + shared_benchmarks_parser.add_argument( '-o', '--optimization', metavar='OPT', choices=['O', 'Onone', 'Osize'], help='optimization level to use: {O,Onone,Osize}, (default: O)', default='O') + + run_parser = subparsers.add_parser( + 'run', + help='Run benchmarks and output results to stdout', + parents=[shared_benchmarks_parser]) run_parser.add_argument( '-i', '--iterations', help='number of times to run each test (default: 1)', @@ -458,9 +607,18 @@ def main(): help='log results to directory (default: no logging)') run_parser.add_argument( '--swift-repo', - help='absolute path to Swift source repo for branch comparison') + help='absolute path to the 
Swift source repository') run_parser.set_defaults(func=run) + check_parser = subparsers.add_parser( + 'check', + help='', + parents=[shared_benchmarks_parser]) + check_parser.add_argument( + '-v', '--verbose', action='store_true', + help='show more details during benchmark analysis',) + check_parser.set_defaults(func=BenchmarkDoctor.run_check) + compare_parser = subparsers.add_parser( 'compare', help='Compare benchmark results') @@ -469,7 +627,7 @@ def main(): help='directory containing benchmark logs') compare_parser.add_argument( '--swift-repo', required=True, - help='absolute path to Swift source repo') + help='absolute path to the Swift source repository') compare_parser.add_argument( '--compare-script', required=True, help='absolute path to compare script') @@ -479,9 +637,11 @@ def main(): 'branch (default: master)') compare_parser.set_defaults(func=compare) - args = parser.parse_args() - if args.func != compare and isinstance(args.optimization, list): - args.optimization = sorted(list(set(args.optimization))) + return parser.parse_args(args) + + +def main(): + args = parse_args(sys.argv[1:]) return args.func(args) diff --git a/benchmark/scripts/CMakeLists.txt b/benchmark/scripts/CMakeLists.txt index 0fa44ab38d750..4922f6dd84031 100644 --- a/benchmark/scripts/CMakeLists.txt +++ b/benchmark/scripts/CMakeLists.txt @@ -42,3 +42,8 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/Benchmark_Driver DESTINATION "${swift-bin-dir}" FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/compare_perf_tests.py + DESTINATION "${swift-bin-dir}" + FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ + GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 2865df59bd116..c2d5388d93830 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -12,61 +12,155 @@ # See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors # # ===---------------------------------------------------------------------===// +""" +This script is used for comparing performance test results. +It is structured into several classes that can be imported into other modules. +""" from __future__ import print_function import argparse -import csv +import re import sys +from bisect import bisect, bisect_left, bisect_right +from collections import namedtuple from math import sqrt -class PerformanceTestResult(object): - """PerformanceTestResult holds results from executing an individual - benchmark from the Swift Benchmark Suite as reported by the test driver - (Benchmark_O, Benchmark_Onone, Benchmark_Osize or Benchmark_Driver). +class Sample(namedtuple('Sample', 'i num_iters runtime')): + u"""Single benchmark measurement. - It depends on the log format emitted by the test driver in the form: - #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) - - The last column, MAX_RSS, is emitted only for runs instrumented by the - Benchmark_Driver to measure rough memory use during the execution of the - benchmark. + Initialized with: + `i`: ordinal number of the sample taken, + `num-num_iters`: number or iterations used to compute it, + `runtime`: in microseconds (μs). """ - def __init__(self, csv_row): - """PerformanceTestResult instance is created from an iterable with - length of 8 or 9. (Like a row provided by the CSV parser.) 
- """ - # csv_row[0] is just an ordinal number of the test - skip that - self.name = csv_row[1] # Name of the performance test - self.samples = int(csv_row[2]) # Number of measurement samples taken - self.min = int(csv_row[3]) # Minimum runtime (ms) - self.max = int(csv_row[4]) # Maximum runtime (ms) - self.mean = int(csv_row[5]) # Mean (average) runtime (ms) - sd = int(csv_row[6]) # Standard Deviation (ms) - # For computing running variance - self.S_runtime = (0 if self.samples < 2 else - (sd * sd) * (self.samples - 1)) - self.median = int(csv_row[7]) # Median runtime (ms) - self.max_rss = ( # Maximum Resident Set Size (B) - int(csv_row[8]) if len(csv_row) > 8 else None) def __repr__(self): + """Shorter Sample formating for debugging purposes.""" + return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) + + +class PerformanceTestSamples(object): + """Collection of runtime samples from the benchmark execution. + + Computes the sample population statistics. + """ + + def __init__(self, name, samples=None): + """Initialized with benchmark name and optional list of Samples.""" + self.name = name # Name of the performance test + self.samples = [] + self.outliers = [] + self._runtimes = [] + self.mean = 0.0 + self.S_runtime = 0.0 # For computing running variance + for sample in samples or []: + self.add(sample) + + def __str__(self): + """Text summary of benchmark statisctics.""" return ( - ''.format(self)) + '{0.name!s} n={0.count!r} ' + 'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} ' + 'Max={0.max!r} ' + 'R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} ' + 'Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}' + .format(self) if self.samples else + '{0.name!s} n=0'.format(self)) + + def add(self, sample): + """Add sample to collection and recompute statistics.""" + assert isinstance(sample, Sample) + self._update_stats(sample) + i = bisect(self._runtimes, sample.runtime) + self._runtimes.insert(i, sample.runtime) + self.samples.insert(i, sample) + + def _update_stats(self, sample): + old_stats = (self.count, self.mean, self.S_runtime) + _, self.mean, self.S_runtime = ( + self.running_mean_variance(old_stats, sample.runtime)) + + def exclude_outliers(self, top_only=False): + """Exclude outliers by applying Interquartile Range Rule. + + Moves the samples outside of the inner fences + (Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes + statistics for the remaining sample population. Optionally apply + only the top inner fence, preserving the small outliers. + + Experimentally, this rule seems to perform well-enough on the + benchmark runtimes in the microbenchmark range to filter out + the environment noise caused by preemtive multitasking. 
+ """ + lo = (0 if top_only else + bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))) + hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) + + outliers = self.samples[:lo] + self.samples[hi:] + samples = self.samples[lo:hi] + + self.__init__(self.name) # re-initialize + for sample in samples: # and + self.add(sample) # re-compute stats + self.outliers = outliers + + @property + def count(self): + """Number of samples used to compute the statistics.""" + return len(self.samples) + + @property + def num_samples(self): + """Number of all samples in the collection.""" + return len(self.samples) + len(self.outliers) + + @property + def all_samples(self): + """List of all samples in ascending order.""" + return sorted(self.samples + self.outliers, key=lambda s: s.i) + + @property + def min(self): + """Minimum sampled value.""" + return self.samples[0].runtime + + @property + def max(self): + """Maximum sampled value.""" + return self.samples[-1].runtime + + @property + def median(self): + """Median sampled value.""" + return self.samples[self.count / 2].runtime + + @property + def q1(self): + """First Quartile (25th Percentile).""" + return self.samples[self.count / 4].runtime + + @property + def q3(self): + """Third Quartile (75th Percentile).""" + return self.samples[(self.count / 2) + (self.count / 4)].runtime + + @property + def iqr(self): + """Interquartile Range.""" + return self.q3 - self.q1 @property def sd(self): - """Standard Deviation (ms)""" - return (0 if self.samples < 2 else - sqrt(self.S_runtime / (self.samples - 1))) + u"""Standard Deviation (μs).""" + return (0 if self.count < 2 else + sqrt(self.S_runtime / (self.count - 1))) @staticmethod def running_mean_variance((k, M_, S_), x): - """ - Compute running variance, B. P. Welford's method + """Compute running variance, B. P. Welford's method. + See Knuth TAOCP vol 2, 3rd edition, page 232, or https://www.johndcook.com/blog/standard_deviation/ M is mean, Standard Deviation is defined as sqrt(S/k-1) @@ -76,50 +170,98 @@ def running_mean_variance((k, M_, S_), x): S = S_ + (x - M_) * (x - M) return (k, M, S) - def merge(self, r): - """Merging test results recomputes min and max. - It attempts to recompute mean and standard deviation when all_samples - are available. There is no correct way to compute these values from - test results that are summaries from more than 3 samples. + @property + def cv(self): + """Coeficient of Variation (%).""" + return (self.sd / self.mean) if self.mean else 0 - The use case here is comparing tests results parsed from concatenated - log files from multiple runs of benchmark driver. - """ - self.min = min(self.min, r.min) - self.max = max(self.max, r.max) - # self.median = None # unclear what to do here + @property + def range(self): + """Range of samples values (Max - Min).""" + return self.max - self.min - def push(x): - state = (self.samples, self.mean, self.S_runtime) - state = self.running_mean_variance(state, x) - (self.samples, self.mean, self.S_runtime) = state + @property + def spread(self): + """Sample Spread; i.e. Range as (%) of Min.""" + return self.range / float(self.min) if self.min else 0 - # Merging test results with up to 3 samples is exact - values = [r.min, r.max, r.median][:min(r.samples, 3)] - map(push, values) - # Column labels for header row in results table - header = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') +class PerformanceTestResult(object): + u"""Result from executing an individual Swift Benchmark Suite benchmark. 
+ + Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize + or Benchmark_Driver). + + It depends on the log format emitted by the test driver in the form: + #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) - def values(self): - """Values property for display in results table comparisons - in format: ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS'). + The last column, MAX_RSS, is emitted only for runs instrumented by the + Benchmark_Driver to measure rough memory use during the execution of the + benchmark. + """ + + def __init__(self, csv_row): + """Initialized from a row with 8 or 9 columns with benchmark summary. + + The row is an iterable, such as a row provided by the CSV parser. """ + self.test_num = csv_row[0] # Ordinal number of the test + self.name = csv_row[1] # Name of the performance test + self.num_samples = ( # Number of measurement samples taken + int(csv_row[2])) + self.min = int(csv_row[3]) # Minimum runtime (μs) + self.max = int(csv_row[4]) # Maximum runtime (μs) + self.mean = float(csv_row[5]) # Mean (average) runtime (μs) + self.sd = float(csv_row[6]) # Standard Deviation (μs) + self.median = int(csv_row[7]) # Median runtime (μs) + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[8]) if len(csv_row) > 8 else None) + self.samples = None + + def __repr__(self): + """Short summary for debugging purposes.""" return ( - self.name, - str(self.min), str(self.max), str(int(self.mean)), - str(self.max_rss) if self.max_rss else '—' - ) + '' + .format(self)) + + def merge(self, r): + """Merge two results. + + Recomputes min, max and mean statistics. If all `samples` are + avaliable, it recomputes all the statistics. + The use case here is comparing test results parsed from concatenated + log files from multiple runs of benchmark driver. + """ + if self.samples and r.samples: + map(self.samples.add, r.samples.samples) + sams = self.samples + self.num_samples = sams.num_samples + self.min, self.max, self.median, self.mean, self.sd = \ + sams.min, sams.max, sams.median, sams.mean, sams.sd + else: + self.min = min(self.min, r.min) + self.max = max(self.max, r.max) + self.mean = ( # pooled mean is the weighted sum of means + (self.mean * self.num_samples) + (r.mean * r.num_samples) + ) / float(self.num_samples + r.num_samples) + self.num_samples += r.num_samples + self.max_rss = min(self.max_rss, r.max_rss) + self.median, self.sd = 0, 0 class ResultComparison(object): """ResultComparison compares MINs from new and old PerformanceTestResult. + It computes speedup ratio and improvement delta (%). """ + def __init__(self, old, new): + """Initialize with old and new `PerformanceTestResult`s to compare.""" self.old = old self.new = new - assert(old.name == new.name) + assert old.name == new.name self.name = old.name # Test name, convenience accessor # Speedup ratio @@ -129,56 +271,149 @@ def __init__(self, old, new): ratio = (new.min + 0.001) / (old.min + 0.001) self.delta = ((ratio - 1) * 100) - # Add ' (?)' to the speedup column as indication of dubious changes: - # result's MIN falls inside the (MIN, MAX) interval of result they are - # being compared with. - self.is_dubious = ( - ' (?)' if ((old.min < new.min and new.min < old.max) or - (new.min < old.min and old.min < new.max)) - else '') + # Indication of dubious changes: when result's MIN falls inside the + # (MIN, MAX) interval of result they are being compared with. 
+ self.is_dubious = ((old.min < new.min and new.min < old.max) or + (new.min < old.min and old.min < new.max)) + + +class LogParser(object): + """Converts log outputs into `PerformanceTestResult`s. + + Supports various formats produced by the `Benchmark_Driver` and + `Benchmark_O`('Onone', 'Osize'). It can also merge together the + results from concatenated log files. + """ + + def __init__(self): + """Create instance of `LogParser`.""" + self.results = [] + self._reset() + + def _reset(self): + """Reset parser to the default state for reading a new result.""" + self.samples, self.num_iters = [], 1 + self.max_rss, self.mem_pages = None, None + self.voluntary_cs, self.involuntary_cs = None, None + + # Parse lines like this + # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) + results_re = re.compile(r'(\d+[, \t]*\w+[, \t]*' + + r'[, \t]*'.join([r'[\d.]+'] * 6) + + r'[, \t]*[\d.]*)') # optional MAX_RSS(B) + + def _append_result(self, result): + columns = result.split(',') + if len(columns) < 8: + columns = result.split() + r = PerformanceTestResult(columns) + if self.max_rss: + r.max_rss = self.max_rss + r.mem_pages = self.mem_pages + r.voluntary_cs = self.voluntary_cs + r.involuntary_cs = self.involuntary_cs + if self.samples: + r.samples = PerformanceTestSamples(r.name, self.samples) + r.samples.exclude_outliers() + self.results.append(r) + self._reset() + + def _store_memory_stats(self, max_rss, mem_pages): + self.max_rss = int(max_rss) + self.mem_pages = int(mem_pages) + + # Regular expression and action to take when it matches the parsed line + state_actions = { + results_re: _append_result, + + # Verbose mode adds new productions: + # Adaptively determined N; test loop multiple adjusting runtime to ~1s + re.compile(r'\s+Measuring with scale (\d+).'): + (lambda self, num_iters: setattr(self, 'num_iters', num_iters)), + + re.compile(r'\s+Sample (\d+),(\d+)'): + (lambda self, i, runtime: + self.samples.append( + Sample(int(i), int(self.num_iters), int(runtime)))), + + # Environmental statistics: memory usage and context switches + re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): + _store_memory_stats, + + re.compile(r'\s+VCS \d+ - \d+ = (\d+)'): + (lambda self, vcs: setattr(self, 'voluntary_cs', int(vcs))), + + re.compile(r'\s+ICS \d+ - \d+ = (\d+)'): + (lambda self, ics: setattr(self, 'involuntary_cs', int(ics))), + } + + def parse_results(self, lines): + """Parse results from the lines of the log output from Benchmark*. - # Column labels for header row in results table - header = ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + Returns a list of `PerformanceTestResult`s. + """ + for line in lines: + for regexp, action in LogParser.state_actions.items(): + match = regexp.match(line) + if match: + action(self, *match.groups()) + break # stop after 1st match + else: # If none matches, skip the line. + # print('skipping: ' + line.rstrip('\n')) + continue + return self.results + + @staticmethod + def _results_from_lines(lines): + tests = LogParser().parse_results(lines) + + def add_or_merge(names, r): + if r.name not in names: + names[r.name] = r + else: + names[r.name].merge(r) + return names + + return reduce(add_or_merge, tests, dict()) + + @staticmethod + def results_from_string(log_contents): + """Parse `PerformanceTestResult`s from the supplied string. - def values(self): - """Values property for display in results table comparisons - in format: ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP'). + Returns dictionary of test names and `PerformanceTestResult`s. 
""" - return (self.name, - str(self.old.min), str(self.new.min), - '{0:+.1f}%'.format(self.delta), - '{0:.2f}x{1}'.format(self.ratio, self.is_dubious)) + return LogParser._results_from_lines(log_contents.splitlines()) + + @staticmethod + def results_from_file(log_file): + """Parse `PerformanceTestResult`s from the log file. + + Returns dictionary of test names and `PerformanceTestResult`s. + """ + with open(log_file) as f: + return LogParser._results_from_lines(f.readlines()) class TestComparator(object): - """TestComparator parses `PerformanceTestResult`s from CSV log files. - Then it determines which tests were `added`, `removed` and which can be + """Analyzes changes betweeen the old and new test results. + + It determines which tests were `added`, `removed` and which can be compared. It then splits the `ResultComparison`s into 3 groups according to the `delta_threshold` by the change in performance: `increased`, - `descreased` and `unchanged`. + `descreased` and `unchanged`. Whole computaion is performed during + initialization and results are provided as properties on this object. The lists of `added`, `removed` and `unchanged` tests are sorted alphabetically. The `increased` and `decreased` lists are sorted in descending order by the amount of change. """ - def __init__(self, old_file, new_file, delta_threshold): - - def load_from_CSV(filename): # handles output from Benchmark_O and - def skip_totals(row): # Benchmark_Driver (added MAX_RSS column) - return len(row) > 7 and row[0].isdigit() - tests = map(PerformanceTestResult, - filter(skip_totals, csv.reader(open(filename)))) - - def add_or_merge(names, r): - if r.name not in names: - names[r.name] = r - else: - names[r.name].merge(r) - return names - return reduce(add_or_merge, tests, dict()) - - old_results = load_from_CSV(old_file) - new_results = load_from_CSV(new_file) + + def __init__(self, old_results, new_results, delta_threshold): + """Initialized with dictionaries of old and new benchmark results. + + Dictionary keys are benchmark names, values are + `PerformanceTestResult`s. + """ old_tests = set(old_results.keys()) new_tests = set(new_results.keys()) comparable_tests = new_tests.intersection(old_tests) @@ -215,13 +450,17 @@ def partition(l, p): class ReportFormatter(object): - """ReportFormatter formats the `PerformanceTestResult`s and - `ResultComparison`s provided by `TestComparator` using their `header` and - `values()` into report table. Supported formats are: `markdown` (used for - displaying benchmark results on GitHub), `git` and `html`. + """Creates the report from perfromance test comparison in specified format. + + `ReportFormatter` formats the `PerformanceTestResult`s and + `ResultComparison`s provided by `TestComparator` into report table. + Supported formats are: `markdown` (used for displaying benchmark results on + GitHub), `git` and `html`. 
""" + def __init__(self, comparator, old_branch, new_branch, changes_only, single_table=False): + """Initialize with `TestComparator` and names of branches.""" self.comparator = comparator self.old_branch = old_branch self.new_branch = new_branch @@ -237,13 +476,45 @@ def __init__(self, comparator, old_branch, new_branch, changes_only, GIT_DETAIL = """ {0} ({1}): {2}""" + PERFORMANCE_TEST_RESULT_HEADER = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') + RESULT_COMPARISON_HEADER = ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + + @staticmethod + def header_for(result): + """Column labels for header row in results table.""" + return (ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER + if isinstance(result, PerformanceTestResult) else + # isinstance(result, ResultComparison) + ReportFormatter.RESULT_COMPARISON_HEADER) + + @staticmethod + def values(result): + """Format values from PerformanceTestResult or ResultComparison. + + Returns tuple of strings to display in the results table. + """ + return ( + (result.name, + str(result.min), str(result.max), str(int(result.mean)), + str(result.max_rss) if result.max_rss else '—') + if isinstance(result, PerformanceTestResult) else + # isinstance(result, ResultComparison) + (result.name, + str(result.old.min), str(result.new.min), + '{0:+.1f}%'.format(result.delta), + '{0:.2f}x{1}'.format(result.ratio, + ' (?)' if result.is_dubious else '')) + ) + def markdown(self): + """Report results of benchmark comparisons in Markdown format.""" return self._formatted_text( ROW='{0} | {1} | {2} | {3} | {4} \n', HEADER_SEPARATOR='---', DETAIL=self.MARKDOWN_DETAIL) def git(self): + """Report results of benchmark comparisons in 'git' format.""" return self._formatted_text( ROW='{0} {1} {2} {3} {4} \n', HEADER_SEPARATOR=' ', @@ -251,14 +522,15 @@ def git(self): def _column_widths(self): changed = self.comparator.decreased + self.comparator.increased - comparisons = (changed if self.changes_only else - changed + self.comparator.unchanged) - comparisons += self.comparator.added + self.comparator.removed + results = (changed if self.changes_only else + changed + self.comparator.unchanged) + results += self.comparator.added + self.comparator.removed widths = [ map(len, columns) for columns in - [PerformanceTestResult.header, ResultComparison.header] + - [c.values() for c in comparisons] + [ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, + ReportFormatter.RESULT_COMPARISON_HEADER] + + [ReportFormatter.values(r) for r in results] ] def max_widths(maximum, widths): @@ -285,8 +557,8 @@ def format_columns(r, strong): def table(title, results, is_strong=False, is_open=False): rows = [ - row(format_columns(result_comparison.values(), is_strong)) - for result_comparison in results + row(format_columns(ReportFormatter.values(r), is_strong)) + for r in results ] if not rows: return '' @@ -294,7 +566,7 @@ def table(title, results, is_strong=False, is_open=False): if self.single_table: t = '' if not self.header_printed: - t += header(results[0].header) + t += header(ReportFormatter.header_for(results[0])) self.header_printed = True t += row(('**' + title + '**', '', '', '', '')) t += ''.join(rows) @@ -303,7 +575,8 @@ def table(title, results, is_strong=False, is_open=False): return DETAIL.format( *[ title, len(results), - (header(results[0].header) + ''.join(rows)), + (header(ReportFormatter.header_for(results[0])) + + ''.join(rows)), ('open' if is_open else '') ]) @@ -363,7 +636,7 @@ def table(title, results, is_strong=False, is_open=False): """ def html(self): - + """Report results of 
benchmark comparisons in HTML format.""" def row(name, old, new, delta, speedup, speedup_color): return self.HTML_ROW.format( name, old, new, delta, speedup_color, speedup) @@ -373,11 +646,12 @@ def header(contents): def table(title, results, speedup_color): rows = [ - row(*(result_comparison.values() + (speedup_color,))) - for result_comparison in results + row(*(ReportFormatter.values(r) + (speedup_color,))) + for r in results ] return ('' if not rows else - header((title, len(results)) + results[0].header[1:]) + + header((title, len(results)) + + ReportFormatter.header_for(results[0])[1:]) + ''.join(rows)) return self.HTML.format( @@ -423,8 +697,10 @@ def parse_args(args): def main(): + """Compare benchmarks for changes in a formatted report.""" args = parse_args(sys.argv[1:]) - comparator = TestComparator(args.old_file, args.new_file, + comparator = TestComparator(LogParser.results_from_file(args.old_file), + LogParser.results_from_file(args.new_file), args.delta_threshold) formatter = ReportFormatter(comparator, args.old_branch, args.new_branch, args.changes_only, args.single_table) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py new file mode 100644 index 0000000000000..7bebca0847407 --- /dev/null +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -0,0 +1,567 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ===--- test_Benchmark_Driver.py ----------------------------------------===// +# +# This source file is part of the Swift.org open source project +# +# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +# Licensed under Apache License v2.0 with Runtime Library Exception +# +# See https://swift.org/LICENSE.txt for license information +# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +# +# ===---------------------------------------------------------------------===// + +import logging +import os +import unittest + +from imp import load_source + +from compare_perf_tests import PerformanceTestResult + +from test_utils import Mock, MockLoggingHandler, Stub, captured_output + +# import Benchmark_Driver # doesn't work because it misses '.py' extension +Benchmark_Driver = load_source( + 'Benchmark_Driver', os.path.join(os.path.dirname( + os.path.abspath(__file__)), 'Benchmark_Driver')) +# from Benchmark_Driver import parse_args +parse_args = Benchmark_Driver.parse_args +BenchmarkDriver = Benchmark_Driver.BenchmarkDriver +BenchmarkDoctor = Benchmark_Driver.BenchmarkDoctor +LoggingReportFormatter = Benchmark_Driver.LoggingReportFormatter + + +class Test_parse_args(unittest.TestCase): + def assert_contains(self, texts, output): + assert not isinstance(texts, str) + for text in texts: + self.assertIn(text, output) + + def test_requires_command_argument(self): + with captured_output() as (_, err): + self.assertRaises(SystemExit, parse_args, []) + self.assert_contains(['usage:', 'COMMAND', 'too few arguments'], + err.getvalue()) + + def test_command_help_lists_commands(self): + with captured_output() as (out, _): + self.assertRaises(SystemExit, parse_args, ['-h']) + self.assert_contains(['COMMAND', 'run', 'compare', 'check'], + out.getvalue()) + + def test_run_benchmarks_by_name_or_ordinal(self): + benchmarks = ['AngryPhonebook', '42'] + self.assertEquals( + parse_args(['run'] + benchmarks).benchmarks, benchmarks) + + def test_run_benchmarks_matching_pattern(self): + regexes = ['Prefix', '.*Suffix.*'] + filters = ['-f', regexes[0], '-f', regexes[1]] + self.assertEquals(parse_args(['run'] + 
filters).filters, regexes) + + def test_run_benchmarks_and_filters_are_exclusive(self): + with captured_output() as (_, err): + self.assertRaises(SystemExit, + parse_args, 'run -f Filter1 Benchmark1'.split()) + self.assert_contains( + ['error', + 'argument BENCHMARK: not allowed with argument -f/--filter'], + err.getvalue()) + + def test_tests_location(self): + here = os.path.dirname(os.path.abspath(__file__)) + self.assertEquals(parse_args(['run']).tests, here) + tests = '/benchmarks/are/here' + self.assertEquals(parse_args(['run', '-t', tests]).tests, tests) + + def test_optimization_argument(self): + self.assertEquals(parse_args(['run']).optimization, 'O') + self.assertEquals( + parse_args(['run', '-o', 'O']).optimization, 'O') + self.assertEquals( + parse_args(['run', '-o', 'Onone']).optimization, 'Onone') + self.assertEquals( + parse_args(['run', '-o', 'Osize']).optimization, 'Osize') + + with captured_output() as (_, err): + self.assertRaises(SystemExit, + parse_args, ['run', '-o', 'bogus']) + self.assert_contains( + ['error:', + "argument -o/--optimization: invalid choice: 'bogus'", + "(choose from 'O', 'Onone', 'Osize')"], + err.getvalue()) + + def test_iterations(self): + self.assertEquals(parse_args(['run']).iterations, 1) + self.assertEquals(parse_args(['run', '-i', '3']).iterations, 3) + with captured_output() as (out, err): + self.assertRaises(SystemExit, + parse_args, ['run', '-i', '-3']) + self.assert_contains( + ['error:', + "argument -i/--iterations: invalid positive_int value: '-3'"], + err.getvalue()) + + def test_check_supports_vebose_output(self): + self.assertFalse(parse_args(['check']).verbose) + self.assertTrue(parse_args(['check', '-v']).verbose) + self.assertTrue(parse_args(['check', '--verbose']).verbose) + + +class ArgsStub(object): + def __init__(self): + self.benchmarks = None + self.filters = None + self.tests = '/benchmarks/' + self.optimization = 'O' + + +class SubprocessMock(Mock): + """Mock for subprocess module's `check_output` method.""" + STDOUT = object() + + def __init__(self, responses=None): + super(SubprocessMock, self).__init__() + + def _check_output(args, stdin=None, stdout=None, stderr=None, + shell=False): + return self.record_and_respond(args, stdin, stdout, stderr, shell) + self.check_output = _check_output + + def record_and_respond(self, args, stdin, stdout, stderr, shell): + # _ = stdin, stdout, shell # ignored in mock + assert stderr == self.STDOUT, 'Errors are NOT redirected to STDOUT' + args = tuple(args) + self.calls.append(args) + return self.respond.get(args, '') + + +class TestBenchmarkDriverInitialization(unittest.TestCase): + def setUp(self): + self.args = ArgsStub() + self.subprocess_mock = SubprocessMock() + + def test_test_harness(self): + self.assertEquals( + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/benchmarks/Benchmark_O') + self.args.tests = '/path' + self.args.optimization = 'Suffix' + self.assertEquals( + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/path/Benchmark_Suffix') + + def test_gets_list_of_precommit_benchmarks(self): + self.subprocess_mock.expect( + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n1\tBenchmark2\t[t3]\n') + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, + ['Benchmark1', 'Benchmark2']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2']) + + list_all_tests = ( + 
'/benchmarks/Benchmark_O --list --delim=\t --skip-tags='.split(' '), + """# Test [Tags] +1 Benchmark1 [t1, t2] +2 Benchmark2 [t3] +3 Benchmark3 [t3, t4] +""") + + def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self): + """Filters tests by name or test number, ignoring unknown.""" + self.args.benchmarks = '1 Benchmark3 1 bogus'.split() + self.subprocess_mock.expect(*self.list_all_tests) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, ['Benchmark1', 'Benchmark3']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) + + def test_filters_benchmarks_by_pattern(self): + self.args.filters = '-f .+3'.split() + self.subprocess_mock.expect(*self.list_all_tests) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, ['Benchmark3']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) + + +class LogParserStub(object): + results_from_string_called = False + + @staticmethod + def results_from_string(log_contents): + LogParserStub.results_from_string_called = True + r = PerformanceTestResult('3,b1,1,123,123,123,0,123'.split(',')) + return {'b1': r} + + +class TestBenchmarkDriverRunningTests(unittest.TestCase): + def setUp(self): + self.args = ArgsStub() + self.parser_stub = LogParserStub() + self.subprocess_mock = SubprocessMock() + self.subprocess_mock.expect( + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tb1\t[tag]\n') + self.driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock, + parser=self.parser_stub) + + def test_run_benchmark_with_multiple_samples(self): + self.driver.run('b1') + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b1')) + self.driver.run('b2', num_samples=5) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b2', '--num-samples=5')) + + def test_run_benchmark_with_specified_number_of_iterations(self): + self.driver.run('b', num_iters=1) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--num-iters=1')) + + def test_run_benchmark_in_verbose_mode(self): + self.driver.run('b', verbose=True) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--verbose')) + + def test_parse_results_from_running_benchmarks(self): + self.driver.run('b') + self.assertTrue(self.parser_stub.results_from_string_called) + + def test_measure_memory(self): + self.driver.run('b', measure_memory=True) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--memory')) + + def test_run_benchmark_independent_samples(self): + self.driver.args.iterations = 3 + r = self.driver.run_independent_samples('b1') + self.assertEquals(self.subprocess_mock.calls.count( + ('/benchmarks/Benchmark_O', 'b1', '--memory')), 3) + self.assertEquals(r.num_samples, 3) # results are merged + + def test_run_bechmarks(self): + def mock_run(test): + self.assertEquals(test, 'b1') + return PerformanceTestResult( + '3,b1,1,123,123,123,0,123,888'.split(',')) + driver = Stub(tests=['b1']) + driver.run_independent_samples = mock_run + run_benchmarks = Benchmark_Driver.run_benchmarks + + with captured_output() as (out, _): + run_benchmarks(driver) + self.assertEquals('\n'.join(""" +#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) +3,b1,1,123,123,123,0,123,888 + 
+Totals,1 + +""".splitlines()[1:]), out.getvalue()) # removes 1st \n from multiline string + + +class BenchmarkDriverMock(Mock): + """Mock for BenchmarkDriver's `run` method""" + def __init__(self, tests, responses=None): + super(BenchmarkDriverMock, self).__init__(responses) + self.tests = tests + self.args = ArgsStub() + + def _run(test, num_samples=None, num_iters=None, + verbose=None, measure_memory=False): + return self.record_and_respond(test, num_samples, num_iters, + verbose, measure_memory) + self.run = _run + + def record_and_respond(self, test, num_samples, num_iters, + verbose, measure_memory): + args = (test, num_samples, num_iters, verbose, measure_memory) + self.calls.append(args) + return self.respond.get(args, _PTR(min=700)) + + +class TestLoggingReportFormatter(unittest.TestCase): + def test_plain_log_format(self): + lr = logging.makeLogRecord({ + 'name': 'Base.category', 'level': logging.DEBUG, + 'levelname': 'DEBUG', 'msg': 'Hi!'}) + f = LoggingReportFormatter() + self.assertEquals(f.format(lr), 'DEBUG category: Hi!') + + def test_colored_log_format(self): + def record(level, level_name): + return logging.makeLogRecord({ + 'name': 'Base.category', 'levelno': level, + 'levelname': level_name, 'msg': 'Hi!'}) + f = LoggingReportFormatter(use_color=True) + self.assertEquals(f.format(record(logging.DEBUG, 'DEBUG')), + '\x1b[1;39mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.INFO, 'INFO')), + '\x1b[1;32mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.WARNING, 'WARNING')), + '\x1b[1;33mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.ERROR, 'ERROR')), + '\x1b[1;31mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.CRITICAL, 'CRITICAL')), + '\x1b[1;35mcategory: Hi!\x1b[1;0m') + + def test_no_prefix_for_base_logging(self): + lr = logging.makeLogRecord({ + 'name': 'Base', 'level': logging.INFO, + 'levelname': 'INFO', 'msg': 'Hi!'}) + f = LoggingReportFormatter() + self.assertEquals(f.format(lr), 'INFO Hi!') + + +def _PTR(min=700, mem_pages=1000): + """Create PerformanceTestResult Stub.""" + return Stub(min=min, mem_pages=mem_pages) + + +def _run(test, num_samples=None, num_iters=None, verbose=None, + measure_memory=False): + """Helper function that constructs tuple with arguments for run method.""" + return ( + test, num_samples, num_iters, verbose, measure_memory) + + +class TestBenchmarkDoctor(unittest.TestCase): + @classmethod + def setUpClass(cls): + super(TestBenchmarkDoctor, cls).setUpClass() + doctor_log = logging.getLogger('BenchmarkDoctor') + cls._doctor_log_handler = MockLoggingHandler(level='DEBUG') + doctor_log.addHandler(cls._doctor_log_handler) + + def setUp(self): + super(TestBenchmarkDoctor, self).setUp() + self.args = Stub(verbose=False) + self._doctor_log_handler.reset() + self.logs = self._doctor_log_handler.messages + + def assert_contains(self, texts, output): + assert not isinstance(texts, str) + for text in texts: + self.assertIn(text, output) + + def test_uses_logging(self): + driver = BenchmarkDriverMock(tests=['B1', 'B2']) + with captured_output() as (out, _): + BenchmarkDoctor(self.args, driver) + self.assert_contains(['Checking tests: B1, B2'], self.logs['debug']) + self.assertEquals(out.getvalue(), '') + + def test_supports_verbose_output(self): + driver = BenchmarkDriverMock(tests=['B1', 'B2']) + driver.verbose = True + with captured_output() as (out, _): + BenchmarkDoctor(Stub(verbose=True), driver) + self.assert_contains(['Checking tests: B1, B2'], out.getvalue()) + + 
def test_uses_report_formatter(self): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock(tests=['B1'])) + console_handler = logging.getLogger('BenchmarkDoctor').handlers[1] + self.assertTrue(doctor) + self.assertTrue(isinstance(console_handler, logging.StreamHandler)) + self.assertTrue(isinstance(console_handler.formatter, + LoggingReportFormatter)) + + def test_measure_10_independent_1s_benchmark_series(self): + """Measurement strategy takes 5 i2 and 5 i1 series. + + Num-samples for Benchmark Driver are calibrated to be powers of two, + take measurements for approximately 1s + based on short initial runtime sampling. Capped at 2k samples. + """ + driver = BenchmarkDriverMock(tests=['B1'], responses=([ + # calibration run, returns a stand-in for PerformanceTestResult + (_run('B1', num_samples=3, num_iters=1), _PTR(min=300))] + + # 5x i1 series, with 300 μs runtime its possible to take 4098 + # samples/s, but it should be capped at 2k + ([(_run('B1', num_samples=2048, num_iters=1, + verbose=True), _PTR(min=300))] * 5) + + # 5x i2 series + ([(_run('B1', num_samples=2048, num_iters=2, + verbose=True), _PTR(min=300))] * 5) + )) + doctor = BenchmarkDoctor(self.args, driver) + with captured_output() as (out, _): + measurements = doctor.measure('B1') + + driver.assert_called_all_expected() + self.assert_contains( + ['name', + 'B1 O i1a', 'B1 O i1b', 'B1 O i1c', 'B1 O i1d', 'B1 O i1e', + 'B1 O i2a', 'B1 O i2b', 'B1 O i2c', 'B1 O i2d', 'B1 O i2e'], + measurements.keys()) + self.assertEquals(measurements['name'], 'B1') + self.assert_contains( + ['Calibrating num-samples for B1:', + 'Runtime 300 μs yields 4096 adjusted samples per second.', + 'Measuring B1, 5 x i1 (2048 samples), 5 x i2 (2048 samples)'], + self.logs['debug']) + + def test_benchmark_name_matches_capital_words_conventions(self): + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', 'CapitalWordsConvention', 'ABBRName', + 'wrongCase', 'Wrong_convention']) + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, driver) + doctor.check() + output = out.getvalue() + + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) + self.assertNotIn('CapitalWordsConvention', output) + self.assertNotIn('ABBRName', output) + self.assert_contains( + ["'wrongCase' name doesn't conform to UpperCamelCase convention.", + "'Wrong_convention' name doesn't conform to UpperCamelCase " + "convention."], self.logs['error']) + self.assert_contains( + ['See http://bit.ly/UpperCamelCase'], self.logs['info']) + + def test_benchmark_name_is_at_most_40_chars_long(self): + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', + 'ThisTestNameIsTooLongAndCausesOverflowsInReports']) + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, driver) + doctor.check() + output = out.getvalue() + + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) + self.assert_contains( + ["'ThisTestNameIsTooLongAndCausesOverflowsInReports' name is " + "48 characters long."], self.logs['error']) + self.assert_contains( + ["Benchmark name should not be longer than 40 characters."], + self.logs['info']) + + def test_benchmark_runtime_range(self): + """Optimized benchmark should run in less then 2500 μs. + + With runtimes less than 2500 μs there is better than 1:4 chance of + being interrupted in the middle of measurement due to elapsed 10 ms + quantum used by macos scheduler. + + Warn about longer runtime. Runtimes over half a second are an error. 
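+
+ The suggested correction factor in the assertions below appears to be
+ the runtime divided by the 2500 μs limit, rounded up to the next power
+ of two: 2501 μs gives a factor of 2; 500000 / 2500 = 200 gives 256.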
+ """ + def measurements(name, runtime): + return {'name': name, + name + ' O i1a': _PTR(min=runtime + 2), + name + ' O i2a': _PTR(min=runtime)} + + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze(measurements('Cheetah', 200)) + doctor.analyze(measurements('Hare', 2501)) + doctor.analyze(measurements('Tortoise', 500000)) + doctor.analyze({'name': 'OverheadTurtle', + 'OverheadTurtle O i1a': _PTR(min=800000), + 'OverheadTurtle O i2a': _PTR(min=700000)}) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('Cheetah', output) + self.assert_contains(["'Hare' execution took at least 2501 μs."], + self.logs['warning']) + self.assert_contains( + ["Decrease the workload of 'Hare' by a factor of 2, " + "to be less than 2500 μs."], self.logs['info']) + self.assert_contains( + ["'Tortoise' execution took at least 500000 μs."], + self.logs['error']) + self.assert_contains( + ["Decrease the workload of 'Tortoise' by a factor of 256, " + "to be less than 2500 μs."], self.logs['info']) + self.assert_contains( + ["'OverheadTurtle' execution took at least 600000 μs" + " (excluding the setup overhead)."], + self.logs['error']) + + def test_benchmark_has_no_significant_setup_overhead(self): + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + 'name': 'NoOverhead', # not 'significant' enough + # Based on DropFirstArray a10/e10: overhead 3.7% (6 μs) + 'NoOverhead O i1a': _PTR(min=162), + 'NoOverhead O i2a': _PTR(min=159)}) + doctor.analyze({ + 'name': 'SO', # Setup Overhead + # Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs) + 'SO O i1a': _PTR(min=69), 'SO O i1b': _PTR(min=70), + 'SO O i2a': _PTR(min=67), 'SO O i2b': _PTR(min=68)}) + doctor.analyze({'name': 'Zero', 'Zero O i1a': _PTR(min=0), + 'Zero O i2a': _PTR(min=0)}) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('NoOverhead', output) + self.assertNotIn('ZeroRuntime', output) + self.assert_contains( + ["'SO' has setup overhead of 4 μs (5.8%)."], + self.logs['error']) + self.assert_contains( + ["Move initialization of benchmark data to the `setUpFunction` " + "registered in `BenchmarkInfo`."], self.logs['info']) + + def test_benchmark_has_constant_memory_use(self): + """Benchmark's memory footprint must not vary with num-iters.""" + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + # The threshold of 15 pages was estimated from previous + # measurements. The normal range should be probably aproximated + # by a function instead of a simple constant. 
+ # TODO: re-evaluate normal range from whole SBS + 'name': 'ConstantMemory', + 'ConstantMemory O i1a': _PTR(mem_pages=1460), + 'ConstantMemory O i2a': _PTR(mem_pages=(1460 + 15))}) + doctor.analyze({ + 'name': 'VariableMemory', # ObserverForwardStruct + 'VariableMemory O i1a': _PTR(mem_pages=1460), + 'VariableMemory O i1b': _PTR(mem_pages=1472), + # i2 series start at 290 pages higher + 'VariableMemory O i2a': _PTR(mem_pages=1750), + 'VariableMemory O i2b': _PTR(mem_pages=1752)}) + measurements = dict([ + ('HighVariance O i{0}{1}'.format(num_iters, suffix), + _PTR(mem_pages=num_pages)) + for num_iters, pages in [ + (1, [6200, 5943, 4818, 5612, 5469]), + (2, [6244, 5832, 4674, 5176, 5490])] + for num_pages, suffix in zip(pages, list('abcde'))]) + measurements['name'] = 'HighVariance' # Array2D + doctor.analyze(measurements) + output = out.getvalue() + + self.assertIn('memory: ', output) + self.assertNotIn('ConstantMemory', output) + self.assert_contains( + ["'VariableMemory' varies the memory footprint of the base " + "workload depending on the `num-iters`."], + self.logs['error']) + self.assert_contains( + ["'HighVariance' has very wide range of memory used between " + "independent, repeated measurements."], + self.logs['warning']) + + +if __name__ == '__main__': + unittest.main() diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index fc7f4e4454e56..f1f578cb54cbe 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -19,36 +19,174 @@ import tempfile import unittest -from StringIO import StringIO -from contextlib import contextmanager - +from compare_perf_tests import LogParser from compare_perf_tests import PerformanceTestResult +from compare_perf_tests import PerformanceTestSamples from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison +from compare_perf_tests import Sample from compare_perf_tests import TestComparator from compare_perf_tests import main from compare_perf_tests import parse_args +from test_utils import captured_output -@contextmanager -def captured_output(): - new_out, new_err = StringIO(), StringIO() - old_out, old_err = sys.stdout, sys.stderr - try: - sys.stdout, sys.stderr = new_out, new_err - yield sys.stdout, sys.stderr - finally: - sys.stdout, sys.stderr = old_out, old_err +class TestSample(unittest.TestCase): + def test_has_named_fields(self): + s = Sample(1, 2, 3) + self.assertEquals(s.i, 1) + self.assertEquals(s.num_iters, 2) + self.assertEquals(s.runtime, 3) -class TestPerformanceTestResult(unittest.TestCase): + def test_is_iterable(self): + s = Sample(1, 2, 3) + self.assertEquals(s[0], 1) + self.assertEquals(s[1], 2) + self.assertEquals(s[2], 3) + +class TestPerformanceTestSamples(unittest.TestCase): + def setUp(self): + self.samples = PerformanceTestSamples('B1') + self.samples.add(Sample(7, 42, 1000)) + + def test_has_name(self): + self.assertEquals(self.samples.name, 'B1') + + def test_stores_samples(self): + self.assertEquals(self.samples.count, 1) + s = self.samples.samples[0] + self.assertTrue(isinstance(s, Sample)) + self.assertEquals(s.i, 7) + self.assertEquals(s.num_iters, 42) + self.assertEquals(s.runtime, 1000) + + def assertEqualFiveNumberSummary(self, ss, expected_fns): + e_min, e_q1, e_median, e_q3, e_max = expected_fns + self.assertEquals(ss.min, e_min) + self.assertEquals(ss.q1, e_q1) + self.assertEquals(ss.median, e_median) + self.assertEquals(ss.q3, e_q3) + self.assertEquals(ss.max, e_max) + 
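+ # A quartile indexing scheme consistent with the expected five-number
+ # summaries in the tests below (an illustrative sketch; the actual
+ # PerformanceTestSamples implementation may differ):
+ #   s = sorted(runtimes); n = len(s)
+ #   q1, median, q3 = s[n // 4], s[n // 2], s[n // 4 + n // 2]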
+ def test_computes_five_number_summary(self): + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1000, 1000, 1000)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1100, 1100, 1100)) + self.samples.add(Sample(3, 1, 1050)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1050, 1050, 1100)) + self.samples.add(Sample(4, 1, 1025)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1100, 1100)) + self.samples.add(Sample(5, 1, 1075)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) + + def test_computes_inter_quartile_range(self): + self.assertEquals(self.samples.iqr, 0) + self.samples.add(Sample(2, 1, 1025)) + self.samples.add(Sample(3, 1, 1050)) + self.samples.add(Sample(4, 1, 1075)) + self.samples.add(Sample(5, 1, 1100)) + self.assertEquals(self.samples.iqr, 50) + + def assertEqualStats(self, stats, expected_stats): + for actual, expected in zip(stats, expected_stats): + self.assertAlmostEquals(actual, expected, places=2) + + def test_computes_mean_sd_cv(self): + ss = self.samples + self.assertEqualStats( + (ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualStats( + (ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) + + def test_computes_range_spread(self): + ss = self.samples + self.assertEqualStats( + (ss.range, ss.spread), (0, 0)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualStats( + (ss.range, ss.spread), (100, 10.0 / 100)) + + def test_init_with_samples(self): + self.samples = PerformanceTestSamples( + 'B2', [Sample(0, 1, 1000), Sample(1, 1, 1100)]) + self.assertEquals(self.samples.count, 2) + self.assertEqualStats( + (self.samples.mean, self.samples.sd, + self.samples.range, self.samples.spread), + (1050.0, 70.71, 100, 9.52 / 100)) + + def test_can_handle_zero_runtime(self): + # guard against dividing by 0 + self.samples = PerformanceTestSamples('Zero') + self.samples.add(Sample(0, 1, 0)) + self.assertEqualStats( + (self.samples.mean, self.samples.sd, self.samples.cv, + self.samples.range, self.samples.spread), + (0, 0, 0.0, 0, 0.0)) + + def test_excludes_outliers(self): + ss = [Sample(*map(int, s.split())) for s in + '0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, ' + '5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, ' + '10 1 1050, 11 1 949, 12 1 1151'.split(',')] + self.samples = PerformanceTestSamples('Outliers', ss) + self.assertEquals(self.samples.count, 13) + self.assertEqualStats( + (self.samples.mean, self.samples.sd), (1050, 52.36)) + + self.samples.exclude_outliers() + + self.assertEquals(self.samples.count, 11) + self.assertEquals(self.samples.outliers, ss[11:]) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) + self.assertEqualStats( + (self.samples.mean, self.samples.sd), (1050, 35.36)) + + def test_excludes_outliers_zero_IQR(self): + self.samples = PerformanceTestSamples('Tight') + self.samples.add(Sample(0, 2, 23)) + self.samples.add(Sample(1, 2, 18)) + self.samples.add(Sample(2, 2, 18)) + self.assertEquals(self.samples.iqr, 0) + + self.samples.exclude_outliers() + + self.assertEquals(self.samples.count, 2) + self.assertEqualStats( + (self.samples.min, self.samples.max), (18, 18)) + + def test_excludes_outliers_top_only(self): + ss = [Sample(*map(int, s.split())) for s in + '0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3'.split(',')] + self.samples = PerformanceTestSamples('Top', ss) + self.assertEqualFiveNumberSummary(self.samples, (1, 
2, 2, 2, 3)) + self.assertEquals(self.samples.iqr, 0) + + self.samples.exclude_outliers(top_only=True) + + self.assertEquals(self.samples.count, 4) + self.assertEqualStats((self.samples.min, self.samples.max), (1, 2)) + + +class TestPerformanceTestResult(unittest.TestCase): def test_init(self): log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' r = PerformanceTestResult(log_line.split(',')) + self.assertEquals(r.test_num, '1') self.assertEquals(r.name, 'AngryPhonebook') - self.assertEquals((r.samples, r.min, r.max, r.mean, r.sd, r.median), - (20, 10664, 12933, 11035, 576, 10884)) + self.assertEquals( + (r.num_samples, r.min, r.max, r.mean, r.sd, r.median), + (20, 10664, 12933, 11035, 576, 10884)) + self.assertEquals(r.samples, None) log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' r = PerformanceTestResult(log_line.split(',')) @@ -60,25 +198,7 @@ def test_repr(self): self.assertEquals( str(r), '' - ) - - def test_header(self): - self.assertEquals(PerformanceTestResult.header, - ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS')) - - def test_values(self): - log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' - r = PerformanceTestResult(log_line.split(',')) - self.assertEquals( - r.values(), - ('AngryPhonebook', '10664', '12933', '11035', '—') - ) - log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' - r = PerformanceTestResult(log_line.split(',')) - self.assertEquals( - r.values(), - ('AngryPhonebook', '12045', '12045', '12045', '10510336') + 'min:10664 max:12933 mean:11035 sd:576 median:10884>' ) def test_merge(self): @@ -90,21 +210,21 @@ def test_merge(self): [line.split(',') for line in tests]) def as_tuple(r): - return (r.min, r.max, round(r.mean, 2), round(r.sd, 2), r.median, - r.max_rss) + return (r.num_samples, r.min, r.max, round(r.mean, 2), + r.sd, r.median, r.max_rss) r = results[0] self.assertEquals(as_tuple(r), - (12045, 12045, 12045, 0, 12045, 10510336)) + (1, 12045, 12045, 12045, 0, 12045, 10510336)) r.merge(results[1]) - self.assertEquals(as_tuple(r), - (12045, 12325, 12185, 197.99, 12045, 10510336)) + self.assertEquals(as_tuple(r), # drops SD and median + (2, 12045, 12325, 12185, 0, 0, 10510336)) r.merge(results[2]) - self.assertEquals(as_tuple(r), - (11616, 12325, 11995.33, 357.10, 12045, 10510336)) + self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS + (3, 11616, 12325, 11995.33, 0, 0, 10502144)) r.merge(results[3]) self.assertEquals(as_tuple(r), - (11616, 12325, 12064, 322.29, 12045, 10510336)) + (4, 11616, 12325, 12064, 0, 0, 10498048)) class TestResultComparison(unittest.TestCase): @@ -142,65 +262,20 @@ def test_init(self): ResultComparison, self.r0, self.r1 ) - def test_header(self): - self.assertEquals(ResultComparison.header, - ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP')) - - def test_values(self): - rc = ResultComparison(self.r1, self.r2) - self.assertEquals( - rc.values(), - ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') - ) - # other way around - rc = ResultComparison(self.r2, self.r1) - self.assertEquals( - rc.values(), - ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') - ) - def test_values_is_dubious(self): + self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious) self.r2.max = self.r1.min + 1 # new.min < old.min < new.max - rc = ResultComparison(self.r1, self.r2) - self.assertEquals(rc.values()[4], '1.06x (?)') + self.assertTrue(ResultComparison(self.r1, self.r2).is_dubious) # other way around: old.min < new.min < old.max - rc = ResultComparison(self.r2, self.r1) - 
self.assertEquals(rc.values()[4], '0.94x (?)') + self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious) -class OldAndNewLog(unittest.TestCase): +class FileSystemIntegration(unittest.TestCase): def setUp(self): # Create a temporary directory self.test_dir = tempfile.mkdtemp() - self.old_log = self.write_temp_file('old.log', """ -1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365 -2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445 -3,Array2D,20,335831,400221,346622,0,346622,28297216 -4,ArrayAppend,20,23641,29000,24990,0,24990,11149926 -34,BitCount,20,3,4,4,0,4,10192896 -35,ByteSwap,20,4,6,4,0,4,10185933 - -Totals,269,67351871,70727022,68220188,0,0,0 -""") - # Log lines are deliberately in descending sort order in order to test - # the ordering of the comparison results. - # ArrayAppend is included twice to test results merging. - self.new_log = self.write_temp_file('new.log', """ -265,TwoSum,20,5006,5679,5111,0,5111 -35,ByteSwap,20,0,0,0,0,0 -34,BitCount,20,9,9,9,0,9 -4,ArrayAppend,20,23641,29000,24990,0,24990 -3,Array2D,20,335831,400221,346622,0,346622 -1,AngryPhonebook,20,10458,12714,11000,0,11000 - -Totals,269,67351871,70727022,68220188,0,0,0 -4,ArrayAppend,1,20000,20000,20000,0,20000 - -Totals,1,20000,20000,20000,0,0,0 -""") - def tearDown(self): # Remove the directory after the test shutil.rmtree(self.test_dir) @@ -211,27 +286,217 @@ def write_temp_file(self, file_name, data): f.write(data) return temp_file_name + +class OldAndNewLog(unittest.TestCase): + old_log_content = """1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365 +2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445 +3,Array2D,20,335831,400221,346622,0,346622,28297216 +4,ArrayAppend,20,23641,29000,24990,0,24990,11149926 +34,BitCount,20,3,4,4,0,4,10192896 +35,ByteSwap,20,4,6,4,0,4,10185933""" + + new_log_content = """265,TwoSum,20,5006,5679,5111,0,5111 +35,ByteSwap,20,0,0,0,0,0 +34,BitCount,20,9,9,9,0,9 +4,ArrayAppend,20,20000,29000,24990,0,24990 +3,Array2D,20,335831,400221,346622,0,346622 +1,AngryPhonebook,20,10458,12714,11000,0,11000""" + + old_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + [line.split(',') + for line in + old_log_content.splitlines()])]) + + new_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + [line.split(',') + for line in + new_log_content.splitlines()])]) + def assert_report_contains(self, texts, report): assert not isinstance(texts, str) for text in texts: self.assertIn(text, report) +class TestLogParser(unittest.TestCase): + def test_parse_results_csv(self): + """Ignores header row, empty lines and Totals row.""" + log = """#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) +34,BitCount,20,3,4,4,0,4 + +Totals,269 +""" + parser = LogParser() + results = parser.parse_results(log.splitlines()) + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + self.assertEquals(results[0].name, 'BitCount') + + def test_parse_results_tab_delimited(self): + log = '34\tBitCount\t20\t3\t4\t4\t0\t4' + parser = LogParser() + results = parser.parse_results(log.splitlines()) + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + self.assertEquals(results[0].name, 'BitCount') + + def test_parse_results_formatted_text(self): + """Parse format that Benchmark_Driver prints to console""" + log = (""" +# TEST SAMPLES MIN(μs) MAX(μs) MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B) +3 Array2D 20 2060 2188 2099 0 2099 20915200 +Totals 281""") + parser = LogParser() + results = parser.parse_results(log.splitlines()[1:]) # 
without 1st \n + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + r = results[0] + self.assertEquals(r.name, 'Array2D') + self.assertEquals(r.max_rss, 20915200) + + def test_parse_results_verbose(self): + """Parse multiple performance test results with 2 sample formats: + single line for N = 1; two lines for N > 1. + """ + verbose_log = """--- DATA --- +#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) +Running AngryPhonebook for 3 samples. + Measuring with scale 78. + Sample 0,11812 + Measuring with scale 90. + Sample 1,13898 + Measuring with scale 91. + Sample 2,11467 +1,AngryPhonebook,3,11467,13898,12392,1315,11812 +Running Array2D for 3 samples. + Sample 0,369900 + Sample 1,381039 + Sample 2,371043 +3,Array2D,3,369900,381039,373994,6127,371043 + +Totals,2""" + parser = LogParser() + results = parser.parse_results(verbose_log.split('\n')) + + r = results[0] + self.assertEquals( + (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), + ('AngryPhonebook', 11467, 13898, 12392, 1315, 11812) + ) + self.assertEquals(r.num_samples, r.samples.num_samples) + self.assertEquals(results[0].samples.all_samples, + [(0, 78, 11812), (1, 90, 13898), (2, 91, 11467)]) + + r = results[1] + self.assertEquals( + (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), + ('Array2D', 369900, 381039, 373994, 6127, 371043) + ) + self.assertEquals(r.num_samples, r.samples.num_samples) + self.assertEquals(results[1].samples.all_samples, + [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) + + def test_parse_environment_verbose(self): + """Parse stats about environment in verbose mode.""" + verbose_log = """ MAX_RSS 8937472 - 8904704 = 32768 (8 pages) + ICS 1338 - 229 = 1109 + VCS 2 - 1 = 1 +2,AngryPhonebook,3,11269,11884,11657,338,11820 +""" + parser = LogParser() + results = parser.parse_results(verbose_log.split('\n')) + + r = results[0] + self.assertEquals(r.max_rss, 32768) + self.assertEquals(r.mem_pages, 8) + self.assertEquals(r.voluntary_cs, 1) + self.assertEquals(r.involuntary_cs, 1109) + + def test_results_from_merge(self): + """Parsing concatenated log merges same PerformanceTestResults""" + concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990 +4,ArrayAppend,1,20000,20000,20000,0,20000""" + results = LogParser.results_from_string(concatenated_logs) + self.assertEquals(results.keys(), ['ArrayAppend']) + result = results['ArrayAppend'] + self.assertTrue(isinstance(result, PerformanceTestResult)) + self.assertEquals(result.min, 20000) + self.assertEquals(result.max, 29000) + + def test_results_from_merge_verbose(self): + """Parsing verbose log merges all PerformanceTestSamples. + ...this should technically be on TestPerformanceTestResult, but it's + easier to write here. 
¯\_(ツ)_/¯""" + concatenated_logs = """ + Sample 0,355883 + Sample 1,358817 + Sample 2,353552 + Sample 3,350815 +3,Array2D,4,350815,358817,354766,3403,355883 + Sample 0,363094 + Sample 1,369169 + Sample 2,376131 + Sample 3,364245 +3,Array2D,4,363094,376131,368159,5931,369169""" + results = LogParser.results_from_string(concatenated_logs) + self.assertEquals(results.keys(), ['Array2D']) + result = results['Array2D'] + self.assertTrue(isinstance(result, PerformanceTestResult)) + self.assertEquals(result.min, 350815) + self.assertEquals(result.max, 376131) + self.assertEquals(result.median, 363094) + self.assertAlmostEquals(result.sd, 8443.37, places=2) + self.assertAlmostEquals(result.mean, 361463.25, places=2) + self.assertEquals(result.num_samples, 8) + samples = result.samples + self.assertTrue(isinstance(samples, PerformanceTestSamples)) + self.assertEquals(samples.count, 8) + + def test_excludes_outliers_from_samples(self): + verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples. + Measuring with scale 2. + Sample 0,455 + Measuring with scale 2. + Sample 1,203 + Measuring with scale 2. + Sample 2,205 + Measuring with scale 2. + Sample 3,207 + Measuring with scale 2. + Sample 4,208 + Measuring with scale 2. + Sample 5,206 + Measuring with scale 2. + Sample 6,205 + Measuring with scale 2. + Sample 7,206 + Measuring with scale 2. + Sample 8,208 + Measuring with scale 2. + Sample 9,184 +65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206 +""" + parser = LogParser() + result = parser.parse_results(verbose_log.split('\n'))[0] + self.assertEquals(result.num_samples, 10) + self.assertEquals(result.samples.count, 8) + self.assertEquals(len(result.samples.outliers), 2) + + class TestTestComparator(OldAndNewLog): def test_init(self): - old_log, new_log = self.old_log, self.new_log - def names(tests): return [t.name for t in tests] - tc = TestComparator(old_log, new_log, 0.05) + tc = TestComparator(self.old_results, self.new_results, 0.05) self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) self.assertEquals(names(tc.increased), ['ByteSwap', 'ArrayAppend']) self.assertEquals(names(tc.decreased), ['BitCount']) self.assertEquals(names(tc.added), ['TwoSum']) self.assertEquals(names(tc.removed), ['AnyHashableWithAClass']) # other way around - tc = TestComparator(new_log, old_log, 0.05) + tc = TestComparator(self.new_results, self.old_results, 0.05) self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) self.assertEquals(names(tc.increased), ['BitCount']) self.assertEquals(names(tc.decreased), ['ByteSwap', 'ArrayAppend']) @@ -239,7 +504,7 @@ def names(tests): self.assertEquals(names(tc.removed), ['TwoSum']) # delta_threshold determines the sorting into change groups; # report only change above 100% (ByteSwap's runtime went to 0): - tc = TestComparator(old_log, new_log, 1) + tc = TestComparator(self.old_results, self.new_results, 1) self.assertEquals( names(tc.unchanged), ['AngryPhonebook', 'Array2D', 'ArrayAppend', 'BitCount'] @@ -251,7 +516,7 @@ def names(tests): class TestReportFormatter(OldAndNewLog): def setUp(self): super(TestReportFormatter, self).setUp() - self.tc = TestComparator(self.old_log, self.new_log, 0.05) + self.tc = TestComparator(self.old_results, self.new_results, 0.05) self.rf = ReportFormatter(self.tc, '', '', changes_only=False) self.markdown = self.rf.markdown() self.git = self.rf.git() @@ -266,6 +531,37 @@ def assert_git_contains(self, texts): def assert_html_contains(self, texts): self.assert_report_contains(texts, self.html) + def 
test_values(self): + self.assertEquals( + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,20,10664,12933,11035,576,10884'.split(','))), + ('AngryPhonebook', '10664', '12933', '11035', '—') + ) + self.assertEquals( + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' + .split(','))), + ('AngryPhonebook', '12045', '12045', '12045', '10510336') + ) + + r1 = PerformanceTestResult( + '1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336'.split(',')) + r2 = PerformanceTestResult( + '1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144'.split(',')) + self.assertEquals( + ReportFormatter.values(ResultComparison(r1, r2)), + ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') + ) + self.assertEquals( + ReportFormatter.values(ResultComparison(r2, r1)), + ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') + ) + r2.max = r1.min + 1 + self.assertEquals( + ReportFormatter.values(ResultComparison(r1, r2))[4], + '1.06x (?)' # is_dubious + ) + def test_justified_columns(self): """Table columns are all formated with same width, defined by the longest value. @@ -281,6 +577,16 @@ def test_column_headers(self): """Report contains table headers for ResultComparisons and changed PerformanceTestResults. """ + performance_test_result = self.tc.added[0] + self.assertEquals( + ReportFormatter.header_for(performance_test_result), + ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') + ) + comparison_result = self.tc.increased[0] + self.assertEquals( + ReportFormatter.header_for(comparison_result), + ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + ) self.assert_markdown_contains([ 'TEST | OLD | NEW | DELTA | SPEEDUP', '--- | --- | --- | --- | --- ', @@ -452,7 +758,7 @@ def test_branch_arguments(self): self.assertEquals(args.new_branch, 'amazing-optimization') -class Test_compare_perf_tests_main(OldAndNewLog): +class Test_compare_perf_tests_main(OldAndNewLog, FileSystemIntegration): """Integration test that invokes the whole comparison script.""" markdown = [ 'Regression (1)', @@ -466,6 +772,11 @@ class Test_compare_perf_tests_main(OldAndNewLog): ] html = ['', "BitCount"] + def setUp(self): + super(Test_compare_perf_tests_main, self).setUp() + self.old_log = self.write_temp_file('old.log', self.old_log_content) + self.new_log = self.write_temp_file('new.log', self.new_log_content) + def execute_main_with_format(self, report_format, test_output=False): report_file = self.test_dir + 'report.log' args = ['compare_perf_tests.py', diff --git a/benchmark/scripts/test_utils.py b/benchmark/scripts/test_utils.py new file mode 100644 index 0000000000000..c83d130b73d03 --- /dev/null +++ b/benchmark/scripts/test_utils.py @@ -0,0 +1,108 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ===--- test_utils.py ---------------------------------------------------===// +# +# This source file is part of the Swift.org open source project +# +# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +# Licensed under Apache License v2.0 with Runtime Library Exception +# +# See https://swift.org/LICENSE.txt for license information +# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +# +# ===---------------------------------------------------------------------===// +""" +Homegrown unit testing helpers. + +The issue is that the unittest.mock was added in Python 3.3 and we need to run +on Python 2.7. + +`Stub` and `Mock` are very rudimentary and support only the limited subset of +common unit testing patterns that is used in this project. 
+""" + +import logging +import sys + +from StringIO import StringIO +from contextlib import contextmanager + + +@contextmanager +def captured_output(): + """Capture stdout and stderr and return their output as string buffers.""" + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr + finally: + sys.stdout, sys.stderr = old_out, old_err + + +class Stub: + """Object literal stub for value objects.""" + + def __init__(self, **attributes): + """All named parameters will create properties on this object.""" + self.__dict__.update(attributes) + + +class Mock(object): + """Minimal infrastructure for manually mocking calls to single method. + + Classes inheriting from Mock are expected to create their own mock of the + tested method with appropriate signature, which appends the call arguments + as tuple to the `calls` list and returns the canned response retrieved from + the `respond` dictionary. + """ + + def __init__(self, responses=None): + """Optionaly initialized with a list of expected calls. See expect.""" + self.calls = [] + self.expected = [] + self.respond = dict() + responses = responses or [] + for call_args, response in responses: + self.expect(call_args, response) + + def expect(self, call_args, response): + """Expect invocation of tested method with given arguments. + + Stores the canned reponse in the `respond` dictionary. + """ + call_args = tuple(call_args) + self.expected.append(call_args) + self.respond[call_args] = response + + def assert_called_with(self, expected_args): + """Verify that the tested method was called with provided arguments.""" + expected_args = tuple(expected_args) + assert expected_args in self.calls, ( + 'Expected: {0} in Called: {1}'.format(expected_args, self.calls)) + + def assert_called_all_expected(self): + """Verify that all expeced invocations of tested method were called.""" + assert self.calls == self.expected, ( + '\nExpected: {0}, \n Called: {1}'.format( + self.expected, self.calls)) + + +class MockLoggingHandler(logging.Handler): + """Mock logging handler to check for expected logs.""" + + def __init__(self, *args, **kwargs): + """Prepare the logger for recording messages for each log level.""" + self.reset() + super(MockLoggingHandler, self).__init__(*args, **kwargs) + + def emit(self, record): + """Store the message in list for the given log level.""" + self.messages[record.levelname.lower()].append(record.getMessage()) + + def reset(self): + """Clear all log messages.""" + self.messages = { + 'debug': [], 'info': [], 'warning': [], 'error': [], 'critical': [] + } diff --git a/benchmark/utils/DriverUtils.swift b/benchmark/utils/DriverUtils.swift index 9e9b0e067507a..53f6ea027d058 100644 --- a/benchmark/utils/DriverUtils.swift +++ b/benchmark/utils/DriverUtils.swift @@ -160,6 +160,7 @@ struct TestConfig { LogMemory: \(logMemory) SampleTime: \(sampleTime) FixedIters: \(fixedNumIters) + Delimiter: \(String(reflecting: delim)) Tests Filter: \(c.tests ?? 
[]) Tests to run: \(testList) diff --git a/test/benchmark/Benchmark_Driver.test-sh b/test/benchmark/Benchmark_Driver.test-sh index dec24a369da2b..ad7559c6147da 100644 --- a/test/benchmark/Benchmark_Driver.test-sh +++ b/test/benchmark/Benchmark_Driver.test-sh @@ -4,19 +4,8 @@ // REQUIRES: CMAKE_GENERATOR=Ninja // Integration tests between Benchmark_Driver and Benchmark_O -// TODO: Keep the "run just once" check and move the rest into unit tests for -// Benchmark_Driver, as they are redundant and unnecessarily slow. - -// RUN: %Benchmark_Driver run Ackermann | %FileCheck %s --check-prefix RUNNAMED -// RUNNAMED: Ackermann - -// RUN: %Benchmark_Driver run 1 | %FileCheck %s --check-prefix RUNBYNUMBER -// RUNBYNUMBER: Ackermann // RUN: %Benchmark_Driver run 1 Ackermann 1 \ // RUN: | %FileCheck %s --check-prefix RUNJUSTONCE // RUNJUSTONCE-LABEL: Ackermann // RUNJUSTONCE-NOT: Ackermann - -// RUN: %Benchmark_Driver run -f Acker | %FileCheck %s --check-prefix RUNFILTER -// RUNFILTER: Ackermann diff --git a/test/benchmark/Benchmark_O.test.md b/test/benchmark/Benchmark_O.test.md index 3e07a76aba807..ac8419eb54e94 100644 --- a/test/benchmark/Benchmark_O.test.md +++ b/test/benchmark/Benchmark_O.test.md @@ -143,6 +143,17 @@ LOGMEMORY: ,{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}} LOGVERBOSE-LABEL: Running AngryPhonebook for 2 samples. ```` +Verify the specified delimiter is used when logging to console. The non-verbose +variant of this invocation is used from [`Benchmark_Driver`][BD] to get the list +of all tests. That's why it is *crucial* to tests this integration point. + +```` +RUN: %Benchmark_O --list --skip-tags= --delim=$'\t' --verbose \ +RUN: | %FileCheck %s --check-prefix LOGVERBOSEDELIM +LOGVERBOSEDELIM: Delimiter: "\t" +LOGVERBOSEDELIM: # Test [Tags] +```` + ## Error Handling ````
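
For orientation, here is a minimal sketch of how the tab-delimited `--list`
output exercised above (a `# Test [Tags]` header followed by numbered rows)
can be turned into test names and tags. The function name and structure are
illustrative only and are not taken from the actual `BenchmarkDriver`
implementation.

```python
def parse_benchmark_listing(listing):
    r"""Parse tab-delimited `Benchmark_O --list` output into (name, tags) pairs.

    Expects a header row `#\tTest\t[Tags]` followed by rows such as
    `1\tBenchmark1\t[t1, t2]`.
    """
    tests = []
    for line in listing.splitlines()[1:]:  # skip the header row
        if not line.strip():
            continue
        _, name, tags = line.split('\t')
        tags = tags.strip('[]')
        tests.append(
            (name, [t.strip() for t in tags.split(',')] if tags else []))
    return tests


if __name__ == '__main__':
    listing = ('#\tTest\t[Tags]\n'
               '1\tBenchmark1\t[t1, t2]\n'
               '2\tBenchmark2\t[t3]\n')
    assert parse_benchmark_listing(listing) == [
        ('Benchmark1', ['t1', 't2']), ('Benchmark2', ['t3'])]
```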