diff --git a/.gitignore b/.gitignore index 74aa31bc37df7..05e03732a5215 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,4 @@ compile_commands.json SortedCFDatabase.def htmlcov .coverage +/benchmark/scripts/Benchmark_Driverc diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 92f12c627855a..eeac28fc226b5 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -14,136 +14,353 @@ # ===---------------------------------------------------------------------===// import argparse -import datetime import glob -import json +import logging +import math import os import re import subprocess import sys import time -import urllib -import urllib2 + +from compare_perf_tests import LogParser DRIVER_DIR = os.path.dirname(os.path.realpath(__file__)) -def parse_results(res, optset): - # Parse lines like this - # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),PEAK_MEMORY(B) - score_re = re.compile(r"(\d+),[ \t]*(\w+)," + - ",".join([r"[ \t]*([\d.]+)"] * 7)) - # The Totals line would be parsed like this. - total_re = re.compile(r"()(Totals)," + - ",".join([r"[ \t]*([\d.]+)"] * 7)) - key_group = 2 - val_group = 4 - mem_group = 9 - - tests = [] - for line in res.split(): - m = score_re.match(line) - if not m: - m = total_re.match(line) - if not m: - continue - testresult = int(m.group(val_group)) - testname = m.group(key_group) - test = {} - test['Data'] = [testresult] - test['Info'] = {} - test['Name'] = "nts.swift/" + optset + "." + testname + ".exec" - tests.append(test) - if testname != 'Totals': - mem_testresult = int(m.group(mem_group)) - mem_test = {} - mem_test['Data'] = [mem_testresult] - mem_test['Info'] = {} - mem_test['Name'] = "nts.swift/mem_maxrss." + \ - optset + "." + testname + ".mem" - tests.append(mem_test) - return tests - - -def submit_to_lnt(data, url): - print("\nSubmitting results to LNT server...") - json_report = {'input_data': json.dumps(data), 'commit': '1'} - data = urllib.urlencode(json_report) - response_str = urllib2.urlopen(urllib2.Request(url, data)) - response = json.loads(response_str.read()) - if 'success' in response: - print("Server response:\tSuccess") - else: - print("Server response:\tError") - print("Error:\t", response['error']) - sys.exit(1) - - -def instrument_test(driver_path, test, num_samples): - """Run a test and instrument its peak memory use""" - test_outputs = [] - for _ in range(num_samples): - test_output_raw = subprocess.check_output( - ['time', '-lp', driver_path, test], - stderr=subprocess.STDOUT - ) - peak_memory = re.match('\s*(\d+)\s*maximum resident set size', - test_output_raw.split('\n')[-15]).group(1) - test_outputs.append(test_output_raw.split()[1].split(',') + - [peak_memory]) - - # Average sample results - num_samples_index = 2 - min_index = 3 - max_index = 4 - avg_start_index = 5 - - # TODO: Correctly take stdev - avg_test_output = test_outputs[0] - avg_test_output[avg_start_index:] = map(int, - avg_test_output[avg_start_index:]) - for test_output in test_outputs[1:]: - for i in range(avg_start_index, len(test_output)): - avg_test_output[i] += int(test_output[i]) - for i in range(avg_start_index, len(avg_test_output)): - avg_test_output[i] = int(round(avg_test_output[i] / - float(len(test_outputs)))) - avg_test_output[num_samples_index] = num_samples - avg_test_output[min_index] = min( - test_outputs, key=lambda x: int(x[min_index]))[min_index] - avg_test_output[max_index] = max( - test_outputs, key=lambda x: int(x[max_index]))[max_index] - avg_test_output 
= map(str, avg_test_output) - - return avg_test_output - - -def get_tests(driver_path, args): - """Return a list of available performance tests""" - driver = ([driver_path, '--list']) - # Use tab delimiter for easier parsing to override the default comma. - # (The third 'column' is always comma-separated list of tags in square - # brackets -- currently unused here.) - driver.append('--delim=\t') - if args.benchmarks or args.filters: - driver.append('--skip-tags=') # list all tests, don't skip any tags - index_name_pairs = [ - line.split('\t')[:2] for line in - subprocess.check_output(driver).split('\n')[1:-1] - ] - indices, names = zip(*index_name_pairs) # unzip list of pairs into 2 lists - if args.filters: - regexes = [re.compile(pattern) for pattern in args.filters] +class BenchmarkDriver(object): + """Executes tests from Swift Benchmark Suite.""" + + def __init__(self, args, tests=None, _subprocess=None, parser=None): + """Initialized with command line arguments. + + Optional parameters for injecting dependencies; used for testing. + """ + self.args = args + self._subprocess = _subprocess or subprocess + self.all_tests = [] + self.tests = tests or self._get_tests() + self.parser = parser or LogParser() + self.results = {} + + def _invoke(self, cmd): + return self._subprocess.check_output( + cmd, stderr=self._subprocess.STDOUT) + + @property + def test_harness(self): + """Full path to test harness binary.""" + suffix = (self.args.optimization if hasattr(self.args, 'optimization') + else 'O') + return os.path.join(self.args.tests, "Benchmark_" + suffix) + + @property + def _cmd_list_benchmarks(self): + # Use tab delimiter for easier parsing to override the default comma. + # (The third 'column' is always comma-separated list of tags in square + # brackets -- currently unused here.) 
+ return [self.test_harness, '--list', '--delim=\t'] + ( + ['--skip-tags='] if (self.args.benchmarks or + self.args.filters) else []) + + def _get_tests(self): + """Return a list of performance tests to run.""" + index_name_pairs = [ + line.split('\t')[:2] for line in + self._invoke(self._cmd_list_benchmarks).split('\n')[1:-1] + ] + # unzip list of pairs into 2 lists + indices, self.all_tests = map(list, zip(*index_name_pairs)) + if self.args.filters: + return self._tests_matching_patterns() + if self.args.benchmarks: + return self._tests_by_name_or_index(indices) + return self.all_tests + + def _tests_matching_patterns(self): + regexes = [re.compile(pattern) for pattern in self.args.filters] return sorted(list(set([name for pattern in regexes - for name in names if pattern.match(name)]))) - if not args.benchmarks: - return names - benchmarks = set(args.benchmarks) - index_to_name = dict(index_name_pairs) - indexed_names = [index_to_name[i] - for i in benchmarks.intersection(set(indices))] - return sorted(list( - benchmarks.intersection(set(names)).union(indexed_names))) + for name in self.all_tests + if pattern.match(name)]))) + + def _tests_by_name_or_index(self, indices): + benchmarks = set(self.args.benchmarks) + index_to_name = dict(zip(indices, self.all_tests)) + indexed_names = [index_to_name[i] + for i in benchmarks.intersection(set(indices))] + return sorted(list( + benchmarks.intersection(set(self.all_tests)).union(indexed_names))) + + def run(self, test, num_samples=None, num_iters=None, + verbose=None, measure_memory=False): + """Execute benchmark and gather results.""" + num_samples = num_samples or 1 + num_iters = num_iters or 0 # automatically determine N to run for 1s + + cmd = self._cmd_run( + test, num_samples, num_iters, verbose, measure_memory) + output = self._invoke(cmd) + result = self.parser.results_from_string(output).items()[0][1] + return result + + def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory): + cmd = [self.test_harness, test] + if num_samples > 1: + cmd.append('--num-samples={0}'.format(num_samples)) + if num_iters > 0: + cmd.append('--num-iters={0}'.format(num_iters)) + if verbose: + cmd.append('--verbose') + if measure_memory: + cmd.append('--memory') + return cmd + + def run_independent_samples(self, test): + """Run benchmark multiple times, gathering independent samples. + + Returns the aggregated result of independent benchmark invocations. + """ + def merge_results(a, b): + a.merge(b) + return a + + return reduce(merge_results, + [self.run(test, measure_memory=True) + for _ in range(self.args.iterations)]) + + +class LoggingReportFormatter(logging.Formatter): + """Format logs as plain text or with colors on the terminal. + + Plain text outputs level, category and massage: 'DEBUG category: Hi!' + Colored output uses color coding based on the level. + """ + + import logging as log + colors = {log.DEBUG: '9', log.INFO: '2', log.WARNING: '3', log.ERROR: '1', + log.CRITICAL: '5'} + + def __init__(self, use_color=False): + """Specify if report should use colors; defaults to False.""" + super(LoggingReportFormatter, self).__init__('%(message)s') + self.use_color = use_color + + def format(self, record): + """Format the log record with level and category.""" + msg = super(LoggingReportFormatter, self).format(record) + category = ((record.name.split('.')[-1] + ': ') if '.' 
in record.name + else '') + return ('\033[1;3{0}m{1}{2}\033[1;0m'.format( + self.colors[record.levelno], category, msg) if self.use_color else + '{0} {1}{2}'.format(record.levelname, category, msg)) + + +class BenchmarkDoctor(object): + """Checks that the benchmark conforms to the standard set of requirements. + + Benchmarks that are part of Swift Benchmark Suite are required to follow + a set of rules that ensure quality measurements. These include naming + convention, robustness when varying execution parameters like + `num-iters` and `num-samples` (no setup overhead, constant memory + consumption). + """ + + log = logging.getLogger('BenchmarkDoctor') + log_naming = log.getChild('naming') + log_runtime = log.getChild('runtime') + log_memory = log.getChild('memory') + log.setLevel(logging.DEBUG) + + def __init__(self, args, driver=None): + """Initialize with command line parameters. + + Optional `driver` parameter for injecting dependency; used for testing. + """ + super(BenchmarkDoctor, self).__init__() + self.driver = driver or BenchmarkDriver(args) + self.results = {} + self.console_handler = logging.StreamHandler(sys.stdout) + self.console_handler.setLevel(logging.DEBUG if args.verbose else + logging.INFO) + self.console_handler.setFormatter( + LoggingReportFormatter(use_color=sys.stdout.isatty())) + self.log.addHandler(self.console_handler) + self.log.debug('Checking tests: %s', ', '.join(self.driver.tests)) + self.requirements = [ + self._name_matches_capital_words_convention, + self._name_is_at_most_40_chars_long, + self._no_setup_overhead, + self._optimized_runtime_in_range, + self._constant_memory_use + ] + + def __del__(self): + """Unregister handler on exit.""" + self.log.removeHandler(self.console_handler) + + capital_words_re = re.compile('[A-Z][a-zA-Z0-9]+') + + @staticmethod + def _name_matches_capital_words_convention(measurements): + name = measurements['name'] + match = BenchmarkDoctor.capital_words_re.match(name) + matched = match.group(0) if match else '' + + if name != matched: + BenchmarkDoctor.log_naming.error( + "'%s' name doesn't conform to UpperCamelCase convention.", + name) + BenchmarkDoctor.log_naming.info( + 'See http://bit.ly/UpperCamelCase') + + @staticmethod + def _name_is_at_most_40_chars_long(measurements): + name = measurements['name'] + + if len(name) > 40: + BenchmarkDoctor.log_naming.error( + "'%s' name is %d characters long.", name, len(name)) + BenchmarkDoctor.log_naming.info( + 'Benchmark name should not be longer than 40 characters.') + + @staticmethod + def _select(measurements, num_iters=None, opt_level='O'): + prefix = measurements['name'] + ' ' + opt_level + prefix += '' if num_iters is None else (' i' + str(num_iters)) + return [series for name, series in measurements.items() + if name.startswith(prefix)] + + @staticmethod + def _optimized_runtime_in_range(measurements): + name = measurements['name'] + setup, ratio = BenchmarkDoctor._setup_overhead(measurements) + setup = 0 if ratio < 0.05 else setup + runtime = min( + [(result.min - correction) for i_series in + [BenchmarkDoctor._select(measurements, num_iters=i) + for correction in [(setup / i) for i in [1, 2]] + ] for result in i_series]) + + if 2500 < runtime: + log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else + BenchmarkDoctor.log_runtime.error) + caveat = '' if setup == 0 else ' (excluding the setup overhead)' + log("'%s' execution took at least %d μs%s.", name, runtime, caveat) + factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2)))) + 
BenchmarkDoctor.log_runtime.info( + "Decrease the workload of '%s' by a factor of %d, " + "to be less than 2500 μs.", name, factor) + + @staticmethod + def _setup_overhead(measurements): + select = BenchmarkDoctor._select + ti1, ti2 = [float(min(mins)) for mins in + [[result.min for result in i_series] for i_series in + [select(measurements, num_iters=i) for i in [1, 2]]]] + setup = int(round(2.0 * (ti1 - ti2))) + ratio = (setup / ti1) if ti1 > 0 else 0 + return (setup, ratio) + + @staticmethod + def _no_setup_overhead(measurements): + setup, ratio = BenchmarkDoctor._setup_overhead(measurements) + if ratio > 0.05: + BenchmarkDoctor.log_runtime.error( + "'%s' has setup overhead of %d μs (%.1f%%).", + measurements['name'], setup, round((100 * ratio), 1)) + BenchmarkDoctor.log_runtime.info( + 'Move initialization of benchmark data to the `setUpFunction` ' + 'registered in `BenchmarkInfo`.') + + @staticmethod + def _constant_memory_use(measurements): + select = BenchmarkDoctor._select + (min_i1, max_i1), (min_i2, max_i2) = [ + (min(memory_use), max(memory_use)) for memory_use in + [[r.mem_pages for r in i_series] for i_series in + [select(measurements, num_iters=i) for i in + [1, 2]]]] + range_i1, range_i2 = max_i1 - min_i1, max_i2 - min_i2 + normal_range = 15 # pages + name = measurements['name'] + + if abs(min_i1 - min_i2) > max(range_i1, range_i2, normal_range): + BenchmarkDoctor.log_memory.error( + "'%s' varies the memory footprint of the base " + "workload depending on the `num-iters`.", name) + + if max(range_i1, range_i2) > normal_range: + BenchmarkDoctor.log_memory.warning( + "'%s' has very wide range of memory used between " + "independent, repeated measurements.", name) + + BenchmarkDoctor.log_memory.debug( + "%s mem_pages [i1, i2]: min=[%d, %d] 𝚫=%d R=[%d, %d]", name, + *[min_i1, min_i2, abs(min_i1 - min_i2), range_i1, range_i2]) + + @staticmethod + def _adjusted_1s_samples(runtime): + u"""Return sample count that can be taken in approximately 1 second. + + Based on the runtime (μs) of one sample taken with num-iters=1. + """ + if runtime == 0: + return 2 + s = 1000000 / float(runtime) # samples for 1s run + s = int(pow(2, round(math.log(s, 2)))) # rounding to power of 2 + return s if s > 2 else 2 # always take at least 2 samples + + def measure(self, benchmark): + """Measure benchmark with varying iterations and optimization levels. + + Returns a dictionary with benchmark name and `PerformanceTestResult`s. 
+        """
+        self.log.debug('Calibrating num-samples for {0}:'.format(benchmark))
+        r = self.driver.run(benchmark, num_samples=3, num_iters=1)  # calibrate
+        num_samples = self._adjusted_1s_samples(r.min)
+
+        def capped(s):
+            return min(s, 2048)
+        run_args = [(capped(num_samples), 1), (capped(num_samples / 2), 2)]
+        opts = self.driver.args.optimization
+        opts = opts if isinstance(opts, list) else [opts]
+        self.log.debug(
+            'Runtime {0} μs yields {1} adjusted samples per second.'.format(
+                r.min, num_samples))
+        self.log.debug(
+            'Measuring {0}, 5 x i1 ({1} samples), 5 x i2 ({2} samples)'.format(
+                benchmark, run_args[0][0], run_args[1][0]))
+
+        measurements = dict(
+            [('{0} {1} i{2}{3}'.format(benchmark, o, i, suffix),
+              self.driver.run(benchmark, num_samples=s, num_iters=i,
+                              verbose=True))
+             for o in opts
+             for s, i in run_args
+             for suffix in list('abcde')
+             ]
+        )
+        measurements['name'] = benchmark
+        return measurements
+
+    def analyze(self, benchmark_measurements):
+        """Analyze whether a benchmark fulfills all requirements."""
+        self.log.debug('Analyzing %s', benchmark_measurements['name'])
+        for rule in self.requirements:
+            rule(benchmark_measurements)
+
+    def check(self):
+        """Measure and analyze all enabled tests."""
+        for test in self.driver.tests:
+            self.analyze(self.measure(test))
+
+    @staticmethod
+    def run_check(args):
+        doctor = BenchmarkDoctor(args)
+        doctor.check()
+        # TODO: non-zero error code when errors are logged
+        # See https://stackoverflow.com/a/31142078/41307
+        return 0
 
 
 def get_current_git_branch(git_repo_path):
@@ -190,11 +407,10 @@ def log_results(log_directory, driver, formatted_output, swift_repo=None):
         f.write(formatted_output)
 
 
-def run_benchmarks(driver, benchmarks=[], num_samples=10, verbose=False,
+def run_benchmarks(driver,
                    log_directory=None, swift_repo=None):
     """Run perf tests individually and return results in a format that's
-    compatible with `parse_results`. If `benchmarks` is not empty,
-    only run tests included in it.
+    compatible with `LogParser`.
     """
     # Set a constant hash seed. Some tests are currently sensitive to
     # fluctuations in the number of hash collisions.
@@ -203,88 +419,43 @@
     # that runs the tests.
os.environ["SWIFT_DETERMINISTIC_HASHING"] = "1" - (total_tests, total_min, total_max, total_mean) = (0, 0, 0, 0) output = [] headings = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', 'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)'] line_format = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}' - if verbose and log_directory: + if log_directory: print(line_format.format(*headings)) - for test in benchmarks: - test_output = instrument_test(driver, test, num_samples) - if test_output[0] == 'Totals': - continue - if verbose: - if log_directory: - print(line_format.format(*test_output)) - else: - print(','.join(test_output)) + else: + print(','.join(headings)) + for test in driver.tests: + r = driver.run_independent_samples(test) + test_output = map(str, [ + r.test_num, r.name, r.num_samples, r.min, r.max, int(r.mean), + int(r.sd), r.median, r.max_rss]) + if log_directory: + print(line_format.format(*test_output)) + else: + print(','.join(test_output)) output.append(test_output) - (samples, _min, _max, mean) = map(int, test_output[2:6]) - total_tests += 1 - total_min += _min - total_max += _max - total_mean += mean if not output: return formatted_output = '\n'.join([','.join(l) for l in output]) - totals = map(str, ['Totals', total_tests, total_min, total_max, - total_mean, '0', '0', '0']) + totals = ['Totals', str(len(driver.tests))] totals_output = '\n\n' + ','.join(totals) - if verbose: - if log_directory: - print(line_format.format(*([''] + totals))) - else: - print(totals_output[1:]) + if log_directory: + print(line_format.format(*([''] + totals + ([''] * 6)))) + else: + print(totals_output[1:]) formatted_output += totals_output if log_directory: - log_results(log_directory, driver, formatted_output, swift_repo) + log_results(log_directory, driver.test_harness, formatted_output, + swift_repo) return formatted_output -def submit(args): - print("SVN revision:\t", args.revision) - print("Machine name:\t", args.machine) - print("Iterations:\t", args.iterations) - print("Optimizations:\t", ','.join(args.optimization)) - print("LNT host:\t", args.lnt_host) - starttime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - print("Start time:\t", starttime) - data = {} - data['Tests'] = [] - data['Machine'] = {'Info': {'name': args.machine}, 'Name': args.machine} - print("\nRunning benchmarks...") - for optset in args.optimization: - print("Opt level:\t", optset) - file = os.path.join(args.tests, "Benchmark_" + optset) - try: - res = run_benchmarks( - file, benchmarks=get_tests(file, args), - num_samples=args.iterations) - data['Tests'].extend(parse_results(res, optset)) - except subprocess.CalledProcessError as e: - print("Execution failed.. 
Test results are empty.") - print("Process output:\n", e.output) - - endtime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - data['Run'] = {'End Time': endtime, - 'Info': {'inferred_run_order': str(args.revision), - 'run_order': str(args.revision), - 'tag': 'nts', - 'test_suite_revision': 'None'}, - 'Start Time': starttime} - print("End time:\t", endtime) - - submit_to_lnt(data, args.lnt_host) - return 0 - - def run(args): - optset = args.optimization - file = os.path.join(args.tests, "Benchmark_" + optset) run_benchmarks( - file, benchmarks=get_tests(file, args), - num_samples=args.iterations, verbose=True, + BenchmarkDriver(args), log_directory=args.output_dir, swift_repo=args.swift_repo) return 0 @@ -392,16 +563,17 @@ def positive_int(value): return ivalue -def main(): +def parse_args(args): + """Parse command line arguments and set default values.""" parser = argparse.ArgumentParser( epilog='Example: ./Benchmark_Driver run -i 5 -f Prefix -f .*Suffix.*' ) subparsers = parser.add_subparsers( title='Swift benchmark driver commands', - help='See COMMAND -h for additional arguments', metavar='') + help='See COMMAND -h for additional arguments', metavar='COMMAND') - parent_parser = argparse.ArgumentParser(add_help=False) - benchmarks_group = parent_parser.add_mutually_exclusive_group() + shared_benchmarks_parser = argparse.ArgumentParser(add_help=False) + benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group() benchmarks_group.add_argument( 'benchmarks', default=[], @@ -410,45 +582,22 @@ def main(): '-f', '--filter', dest='filters', action='append', help='run all tests whose name match regular expression PATTERN, ' + 'multiple filters are supported', metavar="PATTERN") - parent_parser.add_argument( + shared_benchmarks_parser.add_argument( '-t', '--tests', help='directory containing Benchmark_O{,none,size} ' + '(default: DRIVER_DIR)', default=DRIVER_DIR) - - submit_parser = subparsers.add_parser( - 'submit', - help='Run benchmarks and submit results to LNT', - parents=[parent_parser]) - submit_parser.add_argument( - '-o', '--optimization', nargs='+', - help='optimization levels to use (default: O Onone Osize)', - default=['O', 'Onone', 'Osize']) - submit_parser.add_argument( - '-i', '--iterations', - help='number of times to run each test (default: 10)', - type=positive_int, default=10) - submit_parser.add_argument( - '-m', '--machine', required=True, - help='LNT machine name') - submit_parser.add_argument( - '-r', '--revision', required=True, - help='SVN revision of compiler to identify the LNT run', type=int) - submit_parser.add_argument( - '-l', '--lnt_host', required=True, - help='LNT host to submit results to') - submit_parser.set_defaults(func=submit) - - run_parser = subparsers.add_parser( - 'run', - help='Run benchmarks and output results to stdout', - parents=[parent_parser]) - run_parser.add_argument( + shared_benchmarks_parser.add_argument( '-o', '--optimization', metavar='OPT', choices=['O', 'Onone', 'Osize'], help='optimization level to use: {O,Onone,Osize}, (default: O)', default='O') + + run_parser = subparsers.add_parser( + 'run', + help='Run benchmarks and output results to stdout', + parents=[shared_benchmarks_parser]) run_parser.add_argument( '-i', '--iterations', help='number of times to run each test (default: 1)', @@ -458,9 +607,18 @@ def main(): help='log results to directory (default: no logging)') run_parser.add_argument( '--swift-repo', - help='absolute path to Swift source repo for branch comparison') + help='absolute path to the 
Swift source repository') run_parser.set_defaults(func=run) + check_parser = subparsers.add_parser( + 'check', + help='', + parents=[shared_benchmarks_parser]) + check_parser.add_argument( + '-v', '--verbose', action='store_true', + help='show more details during benchmark analysis',) + check_parser.set_defaults(func=BenchmarkDoctor.run_check) + compare_parser = subparsers.add_parser( 'compare', help='Compare benchmark results') @@ -469,7 +627,7 @@ def main(): help='directory containing benchmark logs') compare_parser.add_argument( '--swift-repo', required=True, - help='absolute path to Swift source repo') + help='absolute path to the Swift source repository') compare_parser.add_argument( '--compare-script', required=True, help='absolute path to compare script') @@ -479,9 +637,11 @@ def main(): 'branch (default: master)') compare_parser.set_defaults(func=compare) - args = parser.parse_args() - if args.func != compare and isinstance(args.optimization, list): - args.optimization = sorted(list(set(args.optimization))) + return parser.parse_args(args) + + +def main(): + args = parse_args(sys.argv[1:]) return args.func(args) diff --git a/benchmark/scripts/CMakeLists.txt b/benchmark/scripts/CMakeLists.txt index 0fa44ab38d750..4922f6dd84031 100644 --- a/benchmark/scripts/CMakeLists.txt +++ b/benchmark/scripts/CMakeLists.txt @@ -42,3 +42,8 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/Benchmark_Driver DESTINATION "${swift-bin-dir}" FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/compare_perf_tests.py + DESTINATION "${swift-bin-dir}" + FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ + GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 2865df59bd116..c2d5388d93830 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -12,61 +12,155 @@ # See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors # # ===---------------------------------------------------------------------===// +""" +This script is used for comparing performance test results. +It is structured into several classes that can be imported into other modules. +""" from __future__ import print_function import argparse -import csv +import re import sys +from bisect import bisect, bisect_left, bisect_right +from collections import namedtuple from math import sqrt -class PerformanceTestResult(object): - """PerformanceTestResult holds results from executing an individual - benchmark from the Swift Benchmark Suite as reported by the test driver - (Benchmark_O, Benchmark_Onone, Benchmark_Osize or Benchmark_Driver). +class Sample(namedtuple('Sample', 'i num_iters runtime')): + u"""Single benchmark measurement. - It depends on the log format emitted by the test driver in the form: - #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) - - The last column, MAX_RSS, is emitted only for runs instrumented by the - Benchmark_Driver to measure rough memory use during the execution of the - benchmark. + Initialized with: + `i`: ordinal number of the sample taken, + `num-num_iters`: number or iterations used to compute it, + `runtime`: in microseconds (μs). """ - def __init__(self, csv_row): - """PerformanceTestResult instance is created from an iterable with - length of 8 or 9. (Like a row provided by the CSV parser.) 
- """ - # csv_row[0] is just an ordinal number of the test - skip that - self.name = csv_row[1] # Name of the performance test - self.samples = int(csv_row[2]) # Number of measurement samples taken - self.min = int(csv_row[3]) # Minimum runtime (ms) - self.max = int(csv_row[4]) # Maximum runtime (ms) - self.mean = int(csv_row[5]) # Mean (average) runtime (ms) - sd = int(csv_row[6]) # Standard Deviation (ms) - # For computing running variance - self.S_runtime = (0 if self.samples < 2 else - (sd * sd) * (self.samples - 1)) - self.median = int(csv_row[7]) # Median runtime (ms) - self.max_rss = ( # Maximum Resident Set Size (B) - int(csv_row[8]) if len(csv_row) > 8 else None) def __repr__(self): + """Shorter Sample formating for debugging purposes.""" + return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) + + +class PerformanceTestSamples(object): + """Collection of runtime samples from the benchmark execution. + + Computes the sample population statistics. + """ + + def __init__(self, name, samples=None): + """Initialized with benchmark name and optional list of Samples.""" + self.name = name # Name of the performance test + self.samples = [] + self.outliers = [] + self._runtimes = [] + self.mean = 0.0 + self.S_runtime = 0.0 # For computing running variance + for sample in samples or []: + self.add(sample) + + def __str__(self): + """Text summary of benchmark statisctics.""" return ( - ''.format(self)) + '{0.name!s} n={0.count!r} ' + 'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} ' + 'Max={0.max!r} ' + 'R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} ' + 'Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}' + .format(self) if self.samples else + '{0.name!s} n=0'.format(self)) + + def add(self, sample): + """Add sample to collection and recompute statistics.""" + assert isinstance(sample, Sample) + self._update_stats(sample) + i = bisect(self._runtimes, sample.runtime) + self._runtimes.insert(i, sample.runtime) + self.samples.insert(i, sample) + + def _update_stats(self, sample): + old_stats = (self.count, self.mean, self.S_runtime) + _, self.mean, self.S_runtime = ( + self.running_mean_variance(old_stats, sample.runtime)) + + def exclude_outliers(self, top_only=False): + """Exclude outliers by applying Interquartile Range Rule. + + Moves the samples outside of the inner fences + (Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes + statistics for the remaining sample population. Optionally apply + only the top inner fence, preserving the small outliers. + + Experimentally, this rule seems to perform well-enough on the + benchmark runtimes in the microbenchmark range to filter out + the environment noise caused by preemtive multitasking. 
+ """ + lo = (0 if top_only else + bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))) + hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) + + outliers = self.samples[:lo] + self.samples[hi:] + samples = self.samples[lo:hi] + + self.__init__(self.name) # re-initialize + for sample in samples: # and + self.add(sample) # re-compute stats + self.outliers = outliers + + @property + def count(self): + """Number of samples used to compute the statistics.""" + return len(self.samples) + + @property + def num_samples(self): + """Number of all samples in the collection.""" + return len(self.samples) + len(self.outliers) + + @property + def all_samples(self): + """List of all samples in ascending order.""" + return sorted(self.samples + self.outliers, key=lambda s: s.i) + + @property + def min(self): + """Minimum sampled value.""" + return self.samples[0].runtime + + @property + def max(self): + """Maximum sampled value.""" + return self.samples[-1].runtime + + @property + def median(self): + """Median sampled value.""" + return self.samples[self.count / 2].runtime + + @property + def q1(self): + """First Quartile (25th Percentile).""" + return self.samples[self.count / 4].runtime + + @property + def q3(self): + """Third Quartile (75th Percentile).""" + return self.samples[(self.count / 2) + (self.count / 4)].runtime + + @property + def iqr(self): + """Interquartile Range.""" + return self.q3 - self.q1 @property def sd(self): - """Standard Deviation (ms)""" - return (0 if self.samples < 2 else - sqrt(self.S_runtime / (self.samples - 1))) + u"""Standard Deviation (μs).""" + return (0 if self.count < 2 else + sqrt(self.S_runtime / (self.count - 1))) @staticmethod def running_mean_variance((k, M_, S_), x): - """ - Compute running variance, B. P. Welford's method + """Compute running variance, B. P. Welford's method. + See Knuth TAOCP vol 2, 3rd edition, page 232, or https://www.johndcook.com/blog/standard_deviation/ M is mean, Standard Deviation is defined as sqrt(S/k-1) @@ -76,50 +170,98 @@ def running_mean_variance((k, M_, S_), x): S = S_ + (x - M_) * (x - M) return (k, M, S) - def merge(self, r): - """Merging test results recomputes min and max. - It attempts to recompute mean and standard deviation when all_samples - are available. There is no correct way to compute these values from - test results that are summaries from more than 3 samples. + @property + def cv(self): + """Coeficient of Variation (%).""" + return (self.sd / self.mean) if self.mean else 0 - The use case here is comparing tests results parsed from concatenated - log files from multiple runs of benchmark driver. - """ - self.min = min(self.min, r.min) - self.max = max(self.max, r.max) - # self.median = None # unclear what to do here + @property + def range(self): + """Range of samples values (Max - Min).""" + return self.max - self.min - def push(x): - state = (self.samples, self.mean, self.S_runtime) - state = self.running_mean_variance(state, x) - (self.samples, self.mean, self.S_runtime) = state + @property + def spread(self): + """Sample Spread; i.e. Range as (%) of Min.""" + return self.range / float(self.min) if self.min else 0 - # Merging test results with up to 3 samples is exact - values = [r.min, r.max, r.median][:min(r.samples, 3)] - map(push, values) - # Column labels for header row in results table - header = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') +class PerformanceTestResult(object): + u"""Result from executing an individual Swift Benchmark Suite benchmark. 
+ + Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize + or Benchmark_Driver). + + It depends on the log format emitted by the test driver in the form: + #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) - def values(self): - """Values property for display in results table comparisons - in format: ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS'). + The last column, MAX_RSS, is emitted only for runs instrumented by the + Benchmark_Driver to measure rough memory use during the execution of the + benchmark. + """ + + def __init__(self, csv_row): + """Initialized from a row with 8 or 9 columns with benchmark summary. + + The row is an iterable, such as a row provided by the CSV parser. """ + self.test_num = csv_row[0] # Ordinal number of the test + self.name = csv_row[1] # Name of the performance test + self.num_samples = ( # Number of measurement samples taken + int(csv_row[2])) + self.min = int(csv_row[3]) # Minimum runtime (μs) + self.max = int(csv_row[4]) # Maximum runtime (μs) + self.mean = float(csv_row[5]) # Mean (average) runtime (μs) + self.sd = float(csv_row[6]) # Standard Deviation (μs) + self.median = int(csv_row[7]) # Median runtime (μs) + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[8]) if len(csv_row) > 8 else None) + self.samples = None + + def __repr__(self): + """Short summary for debugging purposes.""" return ( - self.name, - str(self.min), str(self.max), str(int(self.mean)), - str(self.max_rss) if self.max_rss else '—' - ) + '' + .format(self)) + + def merge(self, r): + """Merge two results. + + Recomputes min, max and mean statistics. If all `samples` are + avaliable, it recomputes all the statistics. + The use case here is comparing test results parsed from concatenated + log files from multiple runs of benchmark driver. + """ + if self.samples and r.samples: + map(self.samples.add, r.samples.samples) + sams = self.samples + self.num_samples = sams.num_samples + self.min, self.max, self.median, self.mean, self.sd = \ + sams.min, sams.max, sams.median, sams.mean, sams.sd + else: + self.min = min(self.min, r.min) + self.max = max(self.max, r.max) + self.mean = ( # pooled mean is the weighted sum of means + (self.mean * self.num_samples) + (r.mean * r.num_samples) + ) / float(self.num_samples + r.num_samples) + self.num_samples += r.num_samples + self.max_rss = min(self.max_rss, r.max_rss) + self.median, self.sd = 0, 0 class ResultComparison(object): """ResultComparison compares MINs from new and old PerformanceTestResult. + It computes speedup ratio and improvement delta (%). """ + def __init__(self, old, new): + """Initialize with old and new `PerformanceTestResult`s to compare.""" self.old = old self.new = new - assert(old.name == new.name) + assert old.name == new.name self.name = old.name # Test name, convenience accessor # Speedup ratio @@ -129,56 +271,149 @@ def __init__(self, old, new): ratio = (new.min + 0.001) / (old.min + 0.001) self.delta = ((ratio - 1) * 100) - # Add ' (?)' to the speedup column as indication of dubious changes: - # result's MIN falls inside the (MIN, MAX) interval of result they are - # being compared with. - self.is_dubious = ( - ' (?)' if ((old.min < new.min and new.min < old.max) or - (new.min < old.min and old.min < new.max)) - else '') + # Indication of dubious changes: when result's MIN falls inside the + # (MIN, MAX) interval of result they are being compared with. 
+ self.is_dubious = ((old.min < new.min and new.min < old.max) or + (new.min < old.min and old.min < new.max)) + + +class LogParser(object): + """Converts log outputs into `PerformanceTestResult`s. + + Supports various formats produced by the `Benchmark_Driver` and + `Benchmark_O`('Onone', 'Osize'). It can also merge together the + results from concatenated log files. + """ + + def __init__(self): + """Create instance of `LogParser`.""" + self.results = [] + self._reset() + + def _reset(self): + """Reset parser to the default state for reading a new result.""" + self.samples, self.num_iters = [], 1 + self.max_rss, self.mem_pages = None, None + self.voluntary_cs, self.involuntary_cs = None, None + + # Parse lines like this + # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) + results_re = re.compile(r'(\d+[, \t]*\w+[, \t]*' + + r'[, \t]*'.join([r'[\d.]+'] * 6) + + r'[, \t]*[\d.]*)') # optional MAX_RSS(B) + + def _append_result(self, result): + columns = result.split(',') + if len(columns) < 8: + columns = result.split() + r = PerformanceTestResult(columns) + if self.max_rss: + r.max_rss = self.max_rss + r.mem_pages = self.mem_pages + r.voluntary_cs = self.voluntary_cs + r.involuntary_cs = self.involuntary_cs + if self.samples: + r.samples = PerformanceTestSamples(r.name, self.samples) + r.samples.exclude_outliers() + self.results.append(r) + self._reset() + + def _store_memory_stats(self, max_rss, mem_pages): + self.max_rss = int(max_rss) + self.mem_pages = int(mem_pages) + + # Regular expression and action to take when it matches the parsed line + state_actions = { + results_re: _append_result, + + # Verbose mode adds new productions: + # Adaptively determined N; test loop multiple adjusting runtime to ~1s + re.compile(r'\s+Measuring with scale (\d+).'): + (lambda self, num_iters: setattr(self, 'num_iters', num_iters)), + + re.compile(r'\s+Sample (\d+),(\d+)'): + (lambda self, i, runtime: + self.samples.append( + Sample(int(i), int(self.num_iters), int(runtime)))), + + # Environmental statistics: memory usage and context switches + re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): + _store_memory_stats, + + re.compile(r'\s+VCS \d+ - \d+ = (\d+)'): + (lambda self, vcs: setattr(self, 'voluntary_cs', int(vcs))), + + re.compile(r'\s+ICS \d+ - \d+ = (\d+)'): + (lambda self, ics: setattr(self, 'involuntary_cs', int(ics))), + } + + def parse_results(self, lines): + """Parse results from the lines of the log output from Benchmark*. - # Column labels for header row in results table - header = ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + Returns a list of `PerformanceTestResult`s. + """ + for line in lines: + for regexp, action in LogParser.state_actions.items(): + match = regexp.match(line) + if match: + action(self, *match.groups()) + break # stop after 1st match + else: # If none matches, skip the line. + # print('skipping: ' + line.rstrip('\n')) + continue + return self.results + + @staticmethod + def _results_from_lines(lines): + tests = LogParser().parse_results(lines) + + def add_or_merge(names, r): + if r.name not in names: + names[r.name] = r + else: + names[r.name].merge(r) + return names + + return reduce(add_or_merge, tests, dict()) + + @staticmethod + def results_from_string(log_contents): + """Parse `PerformanceTestResult`s from the supplied string. - def values(self): - """Values property for display in results table comparisons - in format: ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP'). + Returns dictionary of test names and `PerformanceTestResult`s. 
""" - return (self.name, - str(self.old.min), str(self.new.min), - '{0:+.1f}%'.format(self.delta), - '{0:.2f}x{1}'.format(self.ratio, self.is_dubious)) + return LogParser._results_from_lines(log_contents.splitlines()) + + @staticmethod + def results_from_file(log_file): + """Parse `PerformanceTestResult`s from the log file. + + Returns dictionary of test names and `PerformanceTestResult`s. + """ + with open(log_file) as f: + return LogParser._results_from_lines(f.readlines()) class TestComparator(object): - """TestComparator parses `PerformanceTestResult`s from CSV log files. - Then it determines which tests were `added`, `removed` and which can be + """Analyzes changes betweeen the old and new test results. + + It determines which tests were `added`, `removed` and which can be compared. It then splits the `ResultComparison`s into 3 groups according to the `delta_threshold` by the change in performance: `increased`, - `descreased` and `unchanged`. + `descreased` and `unchanged`. Whole computaion is performed during + initialization and results are provided as properties on this object. The lists of `added`, `removed` and `unchanged` tests are sorted alphabetically. The `increased` and `decreased` lists are sorted in descending order by the amount of change. """ - def __init__(self, old_file, new_file, delta_threshold): - - def load_from_CSV(filename): # handles output from Benchmark_O and - def skip_totals(row): # Benchmark_Driver (added MAX_RSS column) - return len(row) > 7 and row[0].isdigit() - tests = map(PerformanceTestResult, - filter(skip_totals, csv.reader(open(filename)))) - - def add_or_merge(names, r): - if r.name not in names: - names[r.name] = r - else: - names[r.name].merge(r) - return names - return reduce(add_or_merge, tests, dict()) - - old_results = load_from_CSV(old_file) - new_results = load_from_CSV(new_file) + + def __init__(self, old_results, new_results, delta_threshold): + """Initialized with dictionaries of old and new benchmark results. + + Dictionary keys are benchmark names, values are + `PerformanceTestResult`s. + """ old_tests = set(old_results.keys()) new_tests = set(new_results.keys()) comparable_tests = new_tests.intersection(old_tests) @@ -215,13 +450,17 @@ def partition(l, p): class ReportFormatter(object): - """ReportFormatter formats the `PerformanceTestResult`s and - `ResultComparison`s provided by `TestComparator` using their `header` and - `values()` into report table. Supported formats are: `markdown` (used for - displaying benchmark results on GitHub), `git` and `html`. + """Creates the report from perfromance test comparison in specified format. + + `ReportFormatter` formats the `PerformanceTestResult`s and + `ResultComparison`s provided by `TestComparator` into report table. + Supported formats are: `markdown` (used for displaying benchmark results on + GitHub), `git` and `html`. 
""" + def __init__(self, comparator, old_branch, new_branch, changes_only, single_table=False): + """Initialize with `TestComparator` and names of branches.""" self.comparator = comparator self.old_branch = old_branch self.new_branch = new_branch @@ -237,13 +476,45 @@ def __init__(self, comparator, old_branch, new_branch, changes_only, GIT_DETAIL = """ {0} ({1}): {2}""" + PERFORMANCE_TEST_RESULT_HEADER = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') + RESULT_COMPARISON_HEADER = ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + + @staticmethod + def header_for(result): + """Column labels for header row in results table.""" + return (ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER + if isinstance(result, PerformanceTestResult) else + # isinstance(result, ResultComparison) + ReportFormatter.RESULT_COMPARISON_HEADER) + + @staticmethod + def values(result): + """Format values from PerformanceTestResult or ResultComparison. + + Returns tuple of strings to display in the results table. + """ + return ( + (result.name, + str(result.min), str(result.max), str(int(result.mean)), + str(result.max_rss) if result.max_rss else '—') + if isinstance(result, PerformanceTestResult) else + # isinstance(result, ResultComparison) + (result.name, + str(result.old.min), str(result.new.min), + '{0:+.1f}%'.format(result.delta), + '{0:.2f}x{1}'.format(result.ratio, + ' (?)' if result.is_dubious else '')) + ) + def markdown(self): + """Report results of benchmark comparisons in Markdown format.""" return self._formatted_text( ROW='{0} | {1} | {2} | {3} | {4} \n', HEADER_SEPARATOR='---', DETAIL=self.MARKDOWN_DETAIL) def git(self): + """Report results of benchmark comparisons in 'git' format.""" return self._formatted_text( ROW='{0} {1} {2} {3} {4} \n', HEADER_SEPARATOR=' ', @@ -251,14 +522,15 @@ def git(self): def _column_widths(self): changed = self.comparator.decreased + self.comparator.increased - comparisons = (changed if self.changes_only else - changed + self.comparator.unchanged) - comparisons += self.comparator.added + self.comparator.removed + results = (changed if self.changes_only else + changed + self.comparator.unchanged) + results += self.comparator.added + self.comparator.removed widths = [ map(len, columns) for columns in - [PerformanceTestResult.header, ResultComparison.header] + - [c.values() for c in comparisons] + [ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, + ReportFormatter.RESULT_COMPARISON_HEADER] + + [ReportFormatter.values(r) for r in results] ] def max_widths(maximum, widths): @@ -285,8 +557,8 @@ def format_columns(r, strong): def table(title, results, is_strong=False, is_open=False): rows = [ - row(format_columns(result_comparison.values(), is_strong)) - for result_comparison in results + row(format_columns(ReportFormatter.values(r), is_strong)) + for r in results ] if not rows: return '' @@ -294,7 +566,7 @@ def table(title, results, is_strong=False, is_open=False): if self.single_table: t = '' if not self.header_printed: - t += header(results[0].header) + t += header(ReportFormatter.header_for(results[0])) self.header_printed = True t += row(('**' + title + '**', '', '', '', '')) t += ''.join(rows) @@ -303,7 +575,8 @@ def table(title, results, is_strong=False, is_open=False): return DETAIL.format( *[ title, len(results), - (header(results[0].header) + ''.join(rows)), + (header(ReportFormatter.header_for(results[0])) + + ''.join(rows)), ('open' if is_open else '') ]) @@ -363,7 +636,7 @@ def table(title, results, is_strong=False, is_open=False): """ def html(self): - + """Report results of 
benchmark comparisons in HTML format.""" def row(name, old, new, delta, speedup, speedup_color): return self.HTML_ROW.format( name, old, new, delta, speedup_color, speedup) @@ -373,11 +646,12 @@ def header(contents): def table(title, results, speedup_color): rows = [ - row(*(result_comparison.values() + (speedup_color,))) - for result_comparison in results + row(*(ReportFormatter.values(r) + (speedup_color,))) + for r in results ] return ('' if not rows else - header((title, len(results)) + results[0].header[1:]) + + header((title, len(results)) + + ReportFormatter.header_for(results[0])[1:]) + ''.join(rows)) return self.HTML.format( @@ -423,8 +697,10 @@ def parse_args(args): def main(): + """Compare benchmarks for changes in a formatted report.""" args = parse_args(sys.argv[1:]) - comparator = TestComparator(args.old_file, args.new_file, + comparator = TestComparator(LogParser.results_from_file(args.old_file), + LogParser.results_from_file(args.new_file), args.delta_threshold) formatter = ReportFormatter(comparator, args.old_branch, args.new_branch, args.changes_only, args.single_table) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py new file mode 100644 index 0000000000000..7bebca0847407 --- /dev/null +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -0,0 +1,567 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ===--- test_Benchmark_Driver.py ----------------------------------------===// +# +# This source file is part of the Swift.org open source project +# +# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +# Licensed under Apache License v2.0 with Runtime Library Exception +# +# See https://swift.org/LICENSE.txt for license information +# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +# +# ===---------------------------------------------------------------------===// + +import logging +import os +import unittest + +from imp import load_source + +from compare_perf_tests import PerformanceTestResult + +from test_utils import Mock, MockLoggingHandler, Stub, captured_output + +# import Benchmark_Driver # doesn't work because it misses '.py' extension +Benchmark_Driver = load_source( + 'Benchmark_Driver', os.path.join(os.path.dirname( + os.path.abspath(__file__)), 'Benchmark_Driver')) +# from Benchmark_Driver import parse_args +parse_args = Benchmark_Driver.parse_args +BenchmarkDriver = Benchmark_Driver.BenchmarkDriver +BenchmarkDoctor = Benchmark_Driver.BenchmarkDoctor +LoggingReportFormatter = Benchmark_Driver.LoggingReportFormatter + + +class Test_parse_args(unittest.TestCase): + def assert_contains(self, texts, output): + assert not isinstance(texts, str) + for text in texts: + self.assertIn(text, output) + + def test_requires_command_argument(self): + with captured_output() as (_, err): + self.assertRaises(SystemExit, parse_args, []) + self.assert_contains(['usage:', 'COMMAND', 'too few arguments'], + err.getvalue()) + + def test_command_help_lists_commands(self): + with captured_output() as (out, _): + self.assertRaises(SystemExit, parse_args, ['-h']) + self.assert_contains(['COMMAND', 'run', 'compare', 'check'], + out.getvalue()) + + def test_run_benchmarks_by_name_or_ordinal(self): + benchmarks = ['AngryPhonebook', '42'] + self.assertEquals( + parse_args(['run'] + benchmarks).benchmarks, benchmarks) + + def test_run_benchmarks_matching_pattern(self): + regexes = ['Prefix', '.*Suffix.*'] + filters = ['-f', regexes[0], '-f', regexes[1]] + self.assertEquals(parse_args(['run'] + 
filters).filters, regexes) + + def test_run_benchmarks_and_filters_are_exclusive(self): + with captured_output() as (_, err): + self.assertRaises(SystemExit, + parse_args, 'run -f Filter1 Benchmark1'.split()) + self.assert_contains( + ['error', + 'argument BENCHMARK: not allowed with argument -f/--filter'], + err.getvalue()) + + def test_tests_location(self): + here = os.path.dirname(os.path.abspath(__file__)) + self.assertEquals(parse_args(['run']).tests, here) + tests = '/benchmarks/are/here' + self.assertEquals(parse_args(['run', '-t', tests]).tests, tests) + + def test_optimization_argument(self): + self.assertEquals(parse_args(['run']).optimization, 'O') + self.assertEquals( + parse_args(['run', '-o', 'O']).optimization, 'O') + self.assertEquals( + parse_args(['run', '-o', 'Onone']).optimization, 'Onone') + self.assertEquals( + parse_args(['run', '-o', 'Osize']).optimization, 'Osize') + + with captured_output() as (_, err): + self.assertRaises(SystemExit, + parse_args, ['run', '-o', 'bogus']) + self.assert_contains( + ['error:', + "argument -o/--optimization: invalid choice: 'bogus'", + "(choose from 'O', 'Onone', 'Osize')"], + err.getvalue()) + + def test_iterations(self): + self.assertEquals(parse_args(['run']).iterations, 1) + self.assertEquals(parse_args(['run', '-i', '3']).iterations, 3) + with captured_output() as (out, err): + self.assertRaises(SystemExit, + parse_args, ['run', '-i', '-3']) + self.assert_contains( + ['error:', + "argument -i/--iterations: invalid positive_int value: '-3'"], + err.getvalue()) + + def test_check_supports_vebose_output(self): + self.assertFalse(parse_args(['check']).verbose) + self.assertTrue(parse_args(['check', '-v']).verbose) + self.assertTrue(parse_args(['check', '--verbose']).verbose) + + +class ArgsStub(object): + def __init__(self): + self.benchmarks = None + self.filters = None + self.tests = '/benchmarks/' + self.optimization = 'O' + + +class SubprocessMock(Mock): + """Mock for subprocess module's `check_output` method.""" + STDOUT = object() + + def __init__(self, responses=None): + super(SubprocessMock, self).__init__() + + def _check_output(args, stdin=None, stdout=None, stderr=None, + shell=False): + return self.record_and_respond(args, stdin, stdout, stderr, shell) + self.check_output = _check_output + + def record_and_respond(self, args, stdin, stdout, stderr, shell): + # _ = stdin, stdout, shell # ignored in mock + assert stderr == self.STDOUT, 'Errors are NOT redirected to STDOUT' + args = tuple(args) + self.calls.append(args) + return self.respond.get(args, '') + + +class TestBenchmarkDriverInitialization(unittest.TestCase): + def setUp(self): + self.args = ArgsStub() + self.subprocess_mock = SubprocessMock() + + def test_test_harness(self): + self.assertEquals( + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/benchmarks/Benchmark_O') + self.args.tests = '/path' + self.args.optimization = 'Suffix' + self.assertEquals( + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/path/Benchmark_Suffix') + + def test_gets_list_of_precommit_benchmarks(self): + self.subprocess_mock.expect( + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n1\tBenchmark2\t[t3]\n') + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, + ['Benchmark1', 'Benchmark2']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2']) + + list_all_tests = ( + 
'/benchmarks/Benchmark_O --list --delim=\t --skip-tags='.split(' '), + """# Test [Tags] +1 Benchmark1 [t1, t2] +2 Benchmark2 [t3] +3 Benchmark3 [t3, t4] +""") + + def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self): + """Filters tests by name or test number, ignoring unknown.""" + self.args.benchmarks = '1 Benchmark3 1 bogus'.split() + self.subprocess_mock.expect(*self.list_all_tests) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, ['Benchmark1', 'Benchmark3']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) + + def test_filters_benchmarks_by_pattern(self): + self.args.filters = '-f .+3'.split() + self.subprocess_mock.expect(*self.list_all_tests) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) + self.subprocess_mock.assert_called_all_expected() + self.assertEquals(driver.tests, ['Benchmark3']) + self.assertEquals(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) + + +class LogParserStub(object): + results_from_string_called = False + + @staticmethod + def results_from_string(log_contents): + LogParserStub.results_from_string_called = True + r = PerformanceTestResult('3,b1,1,123,123,123,0,123'.split(',')) + return {'b1': r} + + +class TestBenchmarkDriverRunningTests(unittest.TestCase): + def setUp(self): + self.args = ArgsStub() + self.parser_stub = LogParserStub() + self.subprocess_mock = SubprocessMock() + self.subprocess_mock.expect( + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tb1\t[tag]\n') + self.driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock, + parser=self.parser_stub) + + def test_run_benchmark_with_multiple_samples(self): + self.driver.run('b1') + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b1')) + self.driver.run('b2', num_samples=5) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b2', '--num-samples=5')) + + def test_run_benchmark_with_specified_number_of_iterations(self): + self.driver.run('b', num_iters=1) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--num-iters=1')) + + def test_run_benchmark_in_verbose_mode(self): + self.driver.run('b', verbose=True) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--verbose')) + + def test_parse_results_from_running_benchmarks(self): + self.driver.run('b') + self.assertTrue(self.parser_stub.results_from_string_called) + + def test_measure_memory(self): + self.driver.run('b', measure_memory=True) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--memory')) + + def test_run_benchmark_independent_samples(self): + self.driver.args.iterations = 3 + r = self.driver.run_independent_samples('b1') + self.assertEquals(self.subprocess_mock.calls.count( + ('/benchmarks/Benchmark_O', 'b1', '--memory')), 3) + self.assertEquals(r.num_samples, 3) # results are merged + + def test_run_bechmarks(self): + def mock_run(test): + self.assertEquals(test, 'b1') + return PerformanceTestResult( + '3,b1,1,123,123,123,0,123,888'.split(',')) + driver = Stub(tests=['b1']) + driver.run_independent_samples = mock_run + run_benchmarks = Benchmark_Driver.run_benchmarks + + with captured_output() as (out, _): + run_benchmarks(driver) + self.assertEquals('\n'.join(""" +#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) +3,b1,1,123,123,123,0,123,888 + 
+Totals,1 + +""".splitlines()[1:]), out.getvalue()) # removes 1st \n from multiline string + + +class BenchmarkDriverMock(Mock): + """Mock for BenchmarkDriver's `run` method""" + def __init__(self, tests, responses=None): + super(BenchmarkDriverMock, self).__init__(responses) + self.tests = tests + self.args = ArgsStub() + + def _run(test, num_samples=None, num_iters=None, + verbose=None, measure_memory=False): + return self.record_and_respond(test, num_samples, num_iters, + verbose, measure_memory) + self.run = _run + + def record_and_respond(self, test, num_samples, num_iters, + verbose, measure_memory): + args = (test, num_samples, num_iters, verbose, measure_memory) + self.calls.append(args) + return self.respond.get(args, _PTR(min=700)) + + +class TestLoggingReportFormatter(unittest.TestCase): + def test_plain_log_format(self): + lr = logging.makeLogRecord({ + 'name': 'Base.category', 'level': logging.DEBUG, + 'levelname': 'DEBUG', 'msg': 'Hi!'}) + f = LoggingReportFormatter() + self.assertEquals(f.format(lr), 'DEBUG category: Hi!') + + def test_colored_log_format(self): + def record(level, level_name): + return logging.makeLogRecord({ + 'name': 'Base.category', 'levelno': level, + 'levelname': level_name, 'msg': 'Hi!'}) + f = LoggingReportFormatter(use_color=True) + self.assertEquals(f.format(record(logging.DEBUG, 'DEBUG')), + '\x1b[1;39mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.INFO, 'INFO')), + '\x1b[1;32mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.WARNING, 'WARNING')), + '\x1b[1;33mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.ERROR, 'ERROR')), + '\x1b[1;31mcategory: Hi!\x1b[1;0m') + self.assertEquals(f.format(record(logging.CRITICAL, 'CRITICAL')), + '\x1b[1;35mcategory: Hi!\x1b[1;0m') + + def test_no_prefix_for_base_logging(self): + lr = logging.makeLogRecord({ + 'name': 'Base', 'level': logging.INFO, + 'levelname': 'INFO', 'msg': 'Hi!'}) + f = LoggingReportFormatter() + self.assertEquals(f.format(lr), 'INFO Hi!') + + +def _PTR(min=700, mem_pages=1000): + """Create PerformanceTestResult Stub.""" + return Stub(min=min, mem_pages=mem_pages) + + +def _run(test, num_samples=None, num_iters=None, verbose=None, + measure_memory=False): + """Helper function that constructs tuple with arguments for run method.""" + return ( + test, num_samples, num_iters, verbose, measure_memory) + + +class TestBenchmarkDoctor(unittest.TestCase): + @classmethod + def setUpClass(cls): + super(TestBenchmarkDoctor, cls).setUpClass() + doctor_log = logging.getLogger('BenchmarkDoctor') + cls._doctor_log_handler = MockLoggingHandler(level='DEBUG') + doctor_log.addHandler(cls._doctor_log_handler) + + def setUp(self): + super(TestBenchmarkDoctor, self).setUp() + self.args = Stub(verbose=False) + self._doctor_log_handler.reset() + self.logs = self._doctor_log_handler.messages + + def assert_contains(self, texts, output): + assert not isinstance(texts, str) + for text in texts: + self.assertIn(text, output) + + def test_uses_logging(self): + driver = BenchmarkDriverMock(tests=['B1', 'B2']) + with captured_output() as (out, _): + BenchmarkDoctor(self.args, driver) + self.assert_contains(['Checking tests: B1, B2'], self.logs['debug']) + self.assertEquals(out.getvalue(), '') + + def test_supports_verbose_output(self): + driver = BenchmarkDriverMock(tests=['B1', 'B2']) + driver.verbose = True + with captured_output() as (out, _): + BenchmarkDoctor(Stub(verbose=True), driver) + self.assert_contains(['Checking tests: B1, B2'], out.getvalue()) + + 
def test_uses_report_formatter(self): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock(tests=['B1'])) + console_handler = logging.getLogger('BenchmarkDoctor').handlers[1] + self.assertTrue(doctor) + self.assertTrue(isinstance(console_handler, logging.StreamHandler)) + self.assertTrue(isinstance(console_handler.formatter, + LoggingReportFormatter)) + + def test_measure_10_independent_1s_benchmark_series(self): + """Measurement strategy takes 5 i2 and 5 i1 series. + + Num-samples for Benchmark Driver are calibrated to be powers of two, + take measurements for approximately 1s + based on short initial runtime sampling. Capped at 2k samples. + """ + driver = BenchmarkDriverMock(tests=['B1'], responses=([ + # calibration run, returns a stand-in for PerformanceTestResult + (_run('B1', num_samples=3, num_iters=1), _PTR(min=300))] + + # 5x i1 series, with 300 μs runtime its possible to take 4098 + # samples/s, but it should be capped at 2k + ([(_run('B1', num_samples=2048, num_iters=1, + verbose=True), _PTR(min=300))] * 5) + + # 5x i2 series + ([(_run('B1', num_samples=2048, num_iters=2, + verbose=True), _PTR(min=300))] * 5) + )) + doctor = BenchmarkDoctor(self.args, driver) + with captured_output() as (out, _): + measurements = doctor.measure('B1') + + driver.assert_called_all_expected() + self.assert_contains( + ['name', + 'B1 O i1a', 'B1 O i1b', 'B1 O i1c', 'B1 O i1d', 'B1 O i1e', + 'B1 O i2a', 'B1 O i2b', 'B1 O i2c', 'B1 O i2d', 'B1 O i2e'], + measurements.keys()) + self.assertEquals(measurements['name'], 'B1') + self.assert_contains( + ['Calibrating num-samples for B1:', + 'Runtime 300 μs yields 4096 adjusted samples per second.', + 'Measuring B1, 5 x i1 (2048 samples), 5 x i2 (2048 samples)'], + self.logs['debug']) + + def test_benchmark_name_matches_capital_words_conventions(self): + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', 'CapitalWordsConvention', 'ABBRName', + 'wrongCase', 'Wrong_convention']) + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, driver) + doctor.check() + output = out.getvalue() + + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) + self.assertNotIn('CapitalWordsConvention', output) + self.assertNotIn('ABBRName', output) + self.assert_contains( + ["'wrongCase' name doesn't conform to UpperCamelCase convention.", + "'Wrong_convention' name doesn't conform to UpperCamelCase " + "convention."], self.logs['error']) + self.assert_contains( + ['See http://bit.ly/UpperCamelCase'], self.logs['info']) + + def test_benchmark_name_is_at_most_40_chars_long(self): + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', + 'ThisTestNameIsTooLongAndCausesOverflowsInReports']) + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, driver) + doctor.check() + output = out.getvalue() + + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) + self.assert_contains( + ["'ThisTestNameIsTooLongAndCausesOverflowsInReports' name is " + "48 characters long."], self.logs['error']) + self.assert_contains( + ["Benchmark name should not be longer than 40 characters."], + self.logs['info']) + + def test_benchmark_runtime_range(self): + """Optimized benchmark should run in less then 2500 μs. + + With runtimes less than 2500 μs there is better than 1:4 chance of + being interrupted in the middle of measurement due to elapsed 10 ms + quantum used by macos scheduler. + + Warn about longer runtime. Runtimes over half a second are an error. 
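+
+ The suggested correction factor in the assertions below appears to be
+ the runtime divided by the 2500 μs limit, rounded up to the next power
+ of two: 2501 μs gives a factor of 2; 500000 / 2500 = 200 gives 256.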
+ """ + def measurements(name, runtime): + return {'name': name, + name + ' O i1a': _PTR(min=runtime + 2), + name + ' O i2a': _PTR(min=runtime)} + + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze(measurements('Cheetah', 200)) + doctor.analyze(measurements('Hare', 2501)) + doctor.analyze(measurements('Tortoise', 500000)) + doctor.analyze({'name': 'OverheadTurtle', + 'OverheadTurtle O i1a': _PTR(min=800000), + 'OverheadTurtle O i2a': _PTR(min=700000)}) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('Cheetah', output) + self.assert_contains(["'Hare' execution took at least 2501 μs."], + self.logs['warning']) + self.assert_contains( + ["Decrease the workload of 'Hare' by a factor of 2, " + "to be less than 2500 μs."], self.logs['info']) + self.assert_contains( + ["'Tortoise' execution took at least 500000 μs."], + self.logs['error']) + self.assert_contains( + ["Decrease the workload of 'Tortoise' by a factor of 256, " + "to be less than 2500 μs."], self.logs['info']) + self.assert_contains( + ["'OverheadTurtle' execution took at least 600000 μs" + " (excluding the setup overhead)."], + self.logs['error']) + + def test_benchmark_has_no_significant_setup_overhead(self): + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + 'name': 'NoOverhead', # not 'significant' enough + # Based on DropFirstArray a10/e10: overhead 3.7% (6 μs) + 'NoOverhead O i1a': _PTR(min=162), + 'NoOverhead O i2a': _PTR(min=159)}) + doctor.analyze({ + 'name': 'SO', # Setup Overhead + # Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs) + 'SO O i1a': _PTR(min=69), 'SO O i1b': _PTR(min=70), + 'SO O i2a': _PTR(min=67), 'SO O i2b': _PTR(min=68)}) + doctor.analyze({'name': 'Zero', 'Zero O i1a': _PTR(min=0), + 'Zero O i2a': _PTR(min=0)}) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('NoOverhead', output) + self.assertNotIn('ZeroRuntime', output) + self.assert_contains( + ["'SO' has setup overhead of 4 μs (5.8%)."], + self.logs['error']) + self.assert_contains( + ["Move initialization of benchmark data to the `setUpFunction` " + "registered in `BenchmarkInfo`."], self.logs['info']) + + def test_benchmark_has_constant_memory_use(self): + """Benchmark's memory footprint must not vary with num-iters.""" + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + # The threshold of 15 pages was estimated from previous + # measurements. The normal range should be probably aproximated + # by a function instead of a simple constant. 
+ # TODO: re-evaluate normal range from whole SBS + 'name': 'ConstantMemory', + 'ConstantMemory O i1a': _PTR(mem_pages=1460), + 'ConstantMemory O i2a': _PTR(mem_pages=(1460 + 15))}) + doctor.analyze({ + 'name': 'VariableMemory', # ObserverForwardStruct + 'VariableMemory O i1a': _PTR(mem_pages=1460), + 'VariableMemory O i1b': _PTR(mem_pages=1472), + # i2 series start at 290 pages higher + 'VariableMemory O i2a': _PTR(mem_pages=1750), + 'VariableMemory O i2b': _PTR(mem_pages=1752)}) + measurements = dict([ + ('HighVariance O i{0}{1}'.format(num_iters, suffix), + _PTR(mem_pages=num_pages)) + for num_iters, pages in [ + (1, [6200, 5943, 4818, 5612, 5469]), + (2, [6244, 5832, 4674, 5176, 5490])] + for num_pages, suffix in zip(pages, list('abcde'))]) + measurements['name'] = 'HighVariance' # Array2D + doctor.analyze(measurements) + output = out.getvalue() + + self.assertIn('memory: ', output) + self.assertNotIn('ConstantMemory', output) + self.assert_contains( + ["'VariableMemory' varies the memory footprint of the base " + "workload depending on the `num-iters`."], + self.logs['error']) + self.assert_contains( + ["'HighVariance' has very wide range of memory used between " + "independent, repeated measurements."], + self.logs['warning']) + + +if __name__ == '__main__': + unittest.main() diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index fc7f4e4454e56..f1f578cb54cbe 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -19,36 +19,174 @@ import tempfile import unittest -from StringIO import StringIO -from contextlib import contextmanager - +from compare_perf_tests import LogParser from compare_perf_tests import PerformanceTestResult +from compare_perf_tests import PerformanceTestSamples from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison +from compare_perf_tests import Sample from compare_perf_tests import TestComparator from compare_perf_tests import main from compare_perf_tests import parse_args +from test_utils import captured_output -@contextmanager -def captured_output(): - new_out, new_err = StringIO(), StringIO() - old_out, old_err = sys.stdout, sys.stderr - try: - sys.stdout, sys.stderr = new_out, new_err - yield sys.stdout, sys.stderr - finally: - sys.stdout, sys.stderr = old_out, old_err +class TestSample(unittest.TestCase): + def test_has_named_fields(self): + s = Sample(1, 2, 3) + self.assertEquals(s.i, 1) + self.assertEquals(s.num_iters, 2) + self.assertEquals(s.runtime, 3) -class TestPerformanceTestResult(unittest.TestCase): + def test_is_iterable(self): + s = Sample(1, 2, 3) + self.assertEquals(s[0], 1) + self.assertEquals(s[1], 2) + self.assertEquals(s[2], 3) + +class TestPerformanceTestSamples(unittest.TestCase): + def setUp(self): + self.samples = PerformanceTestSamples('B1') + self.samples.add(Sample(7, 42, 1000)) + + def test_has_name(self): + self.assertEquals(self.samples.name, 'B1') + + def test_stores_samples(self): + self.assertEquals(self.samples.count, 1) + s = self.samples.samples[0] + self.assertTrue(isinstance(s, Sample)) + self.assertEquals(s.i, 7) + self.assertEquals(s.num_iters, 42) + self.assertEquals(s.runtime, 1000) + + def assertEqualFiveNumberSummary(self, ss, expected_fns): + e_min, e_q1, e_median, e_q3, e_max = expected_fns + self.assertEquals(ss.min, e_min) + self.assertEquals(ss.q1, e_q1) + self.assertEquals(ss.median, e_median) + self.assertEquals(ss.q3, e_q3) + self.assertEquals(ss.max, e_max) + 
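+ # A quartile indexing scheme consistent with the expected five-number
+ # summaries in the tests below (an illustrative sketch; the actual
+ # PerformanceTestSamples implementation may differ):
+ #   s = sorted(runtimes); n = len(s)
+ #   q1, median, q3 = s[n // 4], s[n // 2], s[n // 4 + n // 2]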
+ def test_computes_five_number_summary(self): + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1000, 1000, 1000)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1100, 1100, 1100)) + self.samples.add(Sample(3, 1, 1050)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1050, 1050, 1100)) + self.samples.add(Sample(4, 1, 1025)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1100, 1100)) + self.samples.add(Sample(5, 1, 1075)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) + + def test_computes_inter_quartile_range(self): + self.assertEquals(self.samples.iqr, 0) + self.samples.add(Sample(2, 1, 1025)) + self.samples.add(Sample(3, 1, 1050)) + self.samples.add(Sample(4, 1, 1075)) + self.samples.add(Sample(5, 1, 1100)) + self.assertEquals(self.samples.iqr, 50) + + def assertEqualStats(self, stats, expected_stats): + for actual, expected in zip(stats, expected_stats): + self.assertAlmostEquals(actual, expected, places=2) + + def test_computes_mean_sd_cv(self): + ss = self.samples + self.assertEqualStats( + (ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualStats( + (ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) + + def test_computes_range_spread(self): + ss = self.samples + self.assertEqualStats( + (ss.range, ss.spread), (0, 0)) + self.samples.add(Sample(2, 1, 1100)) + self.assertEqualStats( + (ss.range, ss.spread), (100, 10.0 / 100)) + + def test_init_with_samples(self): + self.samples = PerformanceTestSamples( + 'B2', [Sample(0, 1, 1000), Sample(1, 1, 1100)]) + self.assertEquals(self.samples.count, 2) + self.assertEqualStats( + (self.samples.mean, self.samples.sd, + self.samples.range, self.samples.spread), + (1050.0, 70.71, 100, 9.52 / 100)) + + def test_can_handle_zero_runtime(self): + # guard against dividing by 0 + self.samples = PerformanceTestSamples('Zero') + self.samples.add(Sample(0, 1, 0)) + self.assertEqualStats( + (self.samples.mean, self.samples.sd, self.samples.cv, + self.samples.range, self.samples.spread), + (0, 0, 0.0, 0, 0.0)) + + def test_excludes_outliers(self): + ss = [Sample(*map(int, s.split())) for s in + '0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, ' + '5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, ' + '10 1 1050, 11 1 949, 12 1 1151'.split(',')] + self.samples = PerformanceTestSamples('Outliers', ss) + self.assertEquals(self.samples.count, 13) + self.assertEqualStats( + (self.samples.mean, self.samples.sd), (1050, 52.36)) + + self.samples.exclude_outliers() + + self.assertEquals(self.samples.count, 11) + self.assertEquals(self.samples.outliers, ss[11:]) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) + self.assertEqualStats( + (self.samples.mean, self.samples.sd), (1050, 35.36)) + + def test_excludes_outliers_zero_IQR(self): + self.samples = PerformanceTestSamples('Tight') + self.samples.add(Sample(0, 2, 23)) + self.samples.add(Sample(1, 2, 18)) + self.samples.add(Sample(2, 2, 18)) + self.assertEquals(self.samples.iqr, 0) + + self.samples.exclude_outliers() + + self.assertEquals(self.samples.count, 2) + self.assertEqualStats( + (self.samples.min, self.samples.max), (18, 18)) + + def test_excludes_outliers_top_only(self): + ss = [Sample(*map(int, s.split())) for s in + '0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3'.split(',')] + self.samples = PerformanceTestSamples('Top', ss) + self.assertEqualFiveNumberSummary(self.samples, (1, 
2, 2, 2, 3)) + self.assertEquals(self.samples.iqr, 0) + + self.samples.exclude_outliers(top_only=True) + + self.assertEquals(self.samples.count, 4) + self.assertEqualStats((self.samples.min, self.samples.max), (1, 2)) + + +class TestPerformanceTestResult(unittest.TestCase): def test_init(self): log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' r = PerformanceTestResult(log_line.split(',')) + self.assertEquals(r.test_num, '1') self.assertEquals(r.name, 'AngryPhonebook') - self.assertEquals((r.samples, r.min, r.max, r.mean, r.sd, r.median), - (20, 10664, 12933, 11035, 576, 10884)) + self.assertEquals( + (r.num_samples, r.min, r.max, r.mean, r.sd, r.median), + (20, 10664, 12933, 11035, 576, 10884)) + self.assertEquals(r.samples, None) log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' r = PerformanceTestResult(log_line.split(',')) @@ -60,25 +198,7 @@ def test_repr(self): self.assertEquals( str(r), '' - ) - - def test_header(self): - self.assertEquals(PerformanceTestResult.header, - ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS')) - - def test_values(self): - log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' - r = PerformanceTestResult(log_line.split(',')) - self.assertEquals( - r.values(), - ('AngryPhonebook', '10664', '12933', '11035', '—') - ) - log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' - r = PerformanceTestResult(log_line.split(',')) - self.assertEquals( - r.values(), - ('AngryPhonebook', '12045', '12045', '12045', '10510336') + 'min:10664 max:12933 mean:11035 sd:576 median:10884>' ) def test_merge(self): @@ -90,21 +210,21 @@ def test_merge(self): [line.split(',') for line in tests]) def as_tuple(r): - return (r.min, r.max, round(r.mean, 2), round(r.sd, 2), r.median, - r.max_rss) + return (r.num_samples, r.min, r.max, round(r.mean, 2), + r.sd, r.median, r.max_rss) r = results[0] self.assertEquals(as_tuple(r), - (12045, 12045, 12045, 0, 12045, 10510336)) + (1, 12045, 12045, 12045, 0, 12045, 10510336)) r.merge(results[1]) - self.assertEquals(as_tuple(r), - (12045, 12325, 12185, 197.99, 12045, 10510336)) + self.assertEquals(as_tuple(r), # drops SD and median + (2, 12045, 12325, 12185, 0, 0, 10510336)) r.merge(results[2]) - self.assertEquals(as_tuple(r), - (11616, 12325, 11995.33, 357.10, 12045, 10510336)) + self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS + (3, 11616, 12325, 11995.33, 0, 0, 10502144)) r.merge(results[3]) self.assertEquals(as_tuple(r), - (11616, 12325, 12064, 322.29, 12045, 10510336)) + (4, 11616, 12325, 12064, 0, 0, 10498048)) class TestResultComparison(unittest.TestCase): @@ -142,65 +262,20 @@ def test_init(self): ResultComparison, self.r0, self.r1 ) - def test_header(self): - self.assertEquals(ResultComparison.header, - ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP')) - - def test_values(self): - rc = ResultComparison(self.r1, self.r2) - self.assertEquals( - rc.values(), - ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') - ) - # other way around - rc = ResultComparison(self.r2, self.r1) - self.assertEquals( - rc.values(), - ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') - ) - def test_values_is_dubious(self): + self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious) self.r2.max = self.r1.min + 1 # new.min < old.min < new.max - rc = ResultComparison(self.r1, self.r2) - self.assertEquals(rc.values()[4], '1.06x (?)') + self.assertTrue(ResultComparison(self.r1, self.r2).is_dubious) # other way around: old.min < new.min < old.max - rc = ResultComparison(self.r2, self.r1) - 
self.assertEquals(rc.values()[4], '0.94x (?)') + self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious) -class OldAndNewLog(unittest.TestCase): +class FileSystemIntegration(unittest.TestCase): def setUp(self): # Create a temporary directory self.test_dir = tempfile.mkdtemp() - self.old_log = self.write_temp_file('old.log', """ -1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365 -2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445 -3,Array2D,20,335831,400221,346622,0,346622,28297216 -4,ArrayAppend,20,23641,29000,24990,0,24990,11149926 -34,BitCount,20,3,4,4,0,4,10192896 -35,ByteSwap,20,4,6,4,0,4,10185933 - -Totals,269,67351871,70727022,68220188,0,0,0 -""") - # Log lines are deliberately in descending sort order in order to test - # the ordering of the comparison results. - # ArrayAppend is included twice to test results merging. - self.new_log = self.write_temp_file('new.log', """ -265,TwoSum,20,5006,5679,5111,0,5111 -35,ByteSwap,20,0,0,0,0,0 -34,BitCount,20,9,9,9,0,9 -4,ArrayAppend,20,23641,29000,24990,0,24990 -3,Array2D,20,335831,400221,346622,0,346622 -1,AngryPhonebook,20,10458,12714,11000,0,11000 - -Totals,269,67351871,70727022,68220188,0,0,0 -4,ArrayAppend,1,20000,20000,20000,0,20000 - -Totals,1,20000,20000,20000,0,0,0 -""") - def tearDown(self): # Remove the directory after the test shutil.rmtree(self.test_dir) @@ -211,27 +286,217 @@ def write_temp_file(self, file_name, data): f.write(data) return temp_file_name + +class OldAndNewLog(unittest.TestCase): + old_log_content = """1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365 +2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445 +3,Array2D,20,335831,400221,346622,0,346622,28297216 +4,ArrayAppend,20,23641,29000,24990,0,24990,11149926 +34,BitCount,20,3,4,4,0,4,10192896 +35,ByteSwap,20,4,6,4,0,4,10185933""" + + new_log_content = """265,TwoSum,20,5006,5679,5111,0,5111 +35,ByteSwap,20,0,0,0,0,0 +34,BitCount,20,9,9,9,0,9 +4,ArrayAppend,20,20000,29000,24990,0,24990 +3,Array2D,20,335831,400221,346622,0,346622 +1,AngryPhonebook,20,10458,12714,11000,0,11000""" + + old_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + [line.split(',') + for line in + old_log_content.splitlines()])]) + + new_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + [line.split(',') + for line in + new_log_content.splitlines()])]) + def assert_report_contains(self, texts, report): assert not isinstance(texts, str) for text in texts: self.assertIn(text, report) +class TestLogParser(unittest.TestCase): + def test_parse_results_csv(self): + """Ignores header row, empty lines and Totals row.""" + log = """#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) +34,BitCount,20,3,4,4,0,4 + +Totals,269 +""" + parser = LogParser() + results = parser.parse_results(log.splitlines()) + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + self.assertEquals(results[0].name, 'BitCount') + + def test_parse_results_tab_delimited(self): + log = '34\tBitCount\t20\t3\t4\t4\t0\t4' + parser = LogParser() + results = parser.parse_results(log.splitlines()) + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + self.assertEquals(results[0].name, 'BitCount') + + def test_parse_results_formatted_text(self): + """Parse format that Benchmark_Driver prints to console""" + log = (""" +# TEST SAMPLES MIN(μs) MAX(μs) MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B) +3 Array2D 20 2060 2188 2099 0 2099 20915200 +Totals 281""") + parser = LogParser() + results = parser.parse_results(log.splitlines()[1:]) # 
without 1st \n + self.assertTrue(isinstance(results[0], PerformanceTestResult)) + r = results[0] + self.assertEquals(r.name, 'Array2D') + self.assertEquals(r.max_rss, 20915200) + + def test_parse_results_verbose(self): + """Parse multiple performance test results with 2 sample formats: + single line for N = 1; two lines for N > 1. + """ + verbose_log = """--- DATA --- +#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) +Running AngryPhonebook for 3 samples. + Measuring with scale 78. + Sample 0,11812 + Measuring with scale 90. + Sample 1,13898 + Measuring with scale 91. + Sample 2,11467 +1,AngryPhonebook,3,11467,13898,12392,1315,11812 +Running Array2D for 3 samples. + Sample 0,369900 + Sample 1,381039 + Sample 2,371043 +3,Array2D,3,369900,381039,373994,6127,371043 + +Totals,2""" + parser = LogParser() + results = parser.parse_results(verbose_log.split('\n')) + + r = results[0] + self.assertEquals( + (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), + ('AngryPhonebook', 11467, 13898, 12392, 1315, 11812) + ) + self.assertEquals(r.num_samples, r.samples.num_samples) + self.assertEquals(results[0].samples.all_samples, + [(0, 78, 11812), (1, 90, 13898), (2, 91, 11467)]) + + r = results[1] + self.assertEquals( + (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), + ('Array2D', 369900, 381039, 373994, 6127, 371043) + ) + self.assertEquals(r.num_samples, r.samples.num_samples) + self.assertEquals(results[1].samples.all_samples, + [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) + + def test_parse_environment_verbose(self): + """Parse stats about environment in verbose mode.""" + verbose_log = """ MAX_RSS 8937472 - 8904704 = 32768 (8 pages) + ICS 1338 - 229 = 1109 + VCS 2 - 1 = 1 +2,AngryPhonebook,3,11269,11884,11657,338,11820 +""" + parser = LogParser() + results = parser.parse_results(verbose_log.split('\n')) + + r = results[0] + self.assertEquals(r.max_rss, 32768) + self.assertEquals(r.mem_pages, 8) + self.assertEquals(r.voluntary_cs, 1) + self.assertEquals(r.involuntary_cs, 1109) + + def test_results_from_merge(self): + """Parsing concatenated log merges same PerformanceTestResults""" + concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990 +4,ArrayAppend,1,20000,20000,20000,0,20000""" + results = LogParser.results_from_string(concatenated_logs) + self.assertEquals(results.keys(), ['ArrayAppend']) + result = results['ArrayAppend'] + self.assertTrue(isinstance(result, PerformanceTestResult)) + self.assertEquals(result.min, 20000) + self.assertEquals(result.max, 29000) + + def test_results_from_merge_verbose(self): + """Parsing verbose log merges all PerformanceTestSamples. + ...this should technically be on TestPerformanceTestResult, but it's + easier to write here. 
¯\_(ツ)_/¯""" + concatenated_logs = """ + Sample 0,355883 + Sample 1,358817 + Sample 2,353552 + Sample 3,350815 +3,Array2D,4,350815,358817,354766,3403,355883 + Sample 0,363094 + Sample 1,369169 + Sample 2,376131 + Sample 3,364245 +3,Array2D,4,363094,376131,368159,5931,369169""" + results = LogParser.results_from_string(concatenated_logs) + self.assertEquals(results.keys(), ['Array2D']) + result = results['Array2D'] + self.assertTrue(isinstance(result, PerformanceTestResult)) + self.assertEquals(result.min, 350815) + self.assertEquals(result.max, 376131) + self.assertEquals(result.median, 363094) + self.assertAlmostEquals(result.sd, 8443.37, places=2) + self.assertAlmostEquals(result.mean, 361463.25, places=2) + self.assertEquals(result.num_samples, 8) + samples = result.samples + self.assertTrue(isinstance(samples, PerformanceTestSamples)) + self.assertEquals(samples.count, 8) + + def test_excludes_outliers_from_samples(self): + verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples. + Measuring with scale 2. + Sample 0,455 + Measuring with scale 2. + Sample 1,203 + Measuring with scale 2. + Sample 2,205 + Measuring with scale 2. + Sample 3,207 + Measuring with scale 2. + Sample 4,208 + Measuring with scale 2. + Sample 5,206 + Measuring with scale 2. + Sample 6,205 + Measuring with scale 2. + Sample 7,206 + Measuring with scale 2. + Sample 8,208 + Measuring with scale 2. + Sample 9,184 +65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206 +""" + parser = LogParser() + result = parser.parse_results(verbose_log.split('\n'))[0] + self.assertEquals(result.num_samples, 10) + self.assertEquals(result.samples.count, 8) + self.assertEquals(len(result.samples.outliers), 2) + + class TestTestComparator(OldAndNewLog): def test_init(self): - old_log, new_log = self.old_log, self.new_log - def names(tests): return [t.name for t in tests] - tc = TestComparator(old_log, new_log, 0.05) + tc = TestComparator(self.old_results, self.new_results, 0.05) self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) self.assertEquals(names(tc.increased), ['ByteSwap', 'ArrayAppend']) self.assertEquals(names(tc.decreased), ['BitCount']) self.assertEquals(names(tc.added), ['TwoSum']) self.assertEquals(names(tc.removed), ['AnyHashableWithAClass']) # other way around - tc = TestComparator(new_log, old_log, 0.05) + tc = TestComparator(self.new_results, self.old_results, 0.05) self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) self.assertEquals(names(tc.increased), ['BitCount']) self.assertEquals(names(tc.decreased), ['ByteSwap', 'ArrayAppend']) @@ -239,7 +504,7 @@ def names(tests): self.assertEquals(names(tc.removed), ['TwoSum']) # delta_threshold determines the sorting into change groups; # report only change above 100% (ByteSwap's runtime went to 0): - tc = TestComparator(old_log, new_log, 1) + tc = TestComparator(self.old_results, self.new_results, 1) self.assertEquals( names(tc.unchanged), ['AngryPhonebook', 'Array2D', 'ArrayAppend', 'BitCount'] @@ -251,7 +516,7 @@ def names(tests): class TestReportFormatter(OldAndNewLog): def setUp(self): super(TestReportFormatter, self).setUp() - self.tc = TestComparator(self.old_log, self.new_log, 0.05) + self.tc = TestComparator(self.old_results, self.new_results, 0.05) self.rf = ReportFormatter(self.tc, '', '', changes_only=False) self.markdown = self.rf.markdown() self.git = self.rf.git() @@ -266,6 +531,37 @@ def assert_git_contains(self, texts): def assert_html_contains(self, texts): self.assert_report_contains(texts, self.html) + def 
test_values(self): + self.assertEquals( + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,20,10664,12933,11035,576,10884'.split(','))), + ('AngryPhonebook', '10664', '12933', '11035', '—') + ) + self.assertEquals( + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' + .split(','))), + ('AngryPhonebook', '12045', '12045', '12045', '10510336') + ) + + r1 = PerformanceTestResult( + '1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336'.split(',')) + r2 = PerformanceTestResult( + '1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144'.split(',')) + self.assertEquals( + ReportFormatter.values(ResultComparison(r1, r2)), + ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') + ) + self.assertEquals( + ReportFormatter.values(ResultComparison(r2, r1)), + ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') + ) + r2.max = r1.min + 1 + self.assertEquals( + ReportFormatter.values(ResultComparison(r1, r2))[4], + '1.06x (?)' # is_dubious + ) + def test_justified_columns(self): """Table columns are all formated with same width, defined by the longest value. @@ -281,6 +577,16 @@ def test_column_headers(self): """Report contains table headers for ResultComparisons and changed PerformanceTestResults. """ + performance_test_result = self.tc.added[0] + self.assertEquals( + ReportFormatter.header_for(performance_test_result), + ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') + ) + comparison_result = self.tc.increased[0] + self.assertEquals( + ReportFormatter.header_for(comparison_result), + ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP') + ) self.assert_markdown_contains([ 'TEST | OLD | NEW | DELTA | SPEEDUP', '--- | --- | --- | --- | --- ', @@ -452,7 +758,7 @@ def test_branch_arguments(self): self.assertEquals(args.new_branch, 'amazing-optimization') -class Test_compare_perf_tests_main(OldAndNewLog): +class Test_compare_perf_tests_main(OldAndNewLog, FileSystemIntegration): """Integration test that invokes the whole comparison script.""" markdown = [ 'Regression (1)', @@ -466,6 +772,11 @@ class Test_compare_perf_tests_main(OldAndNewLog): ] html = ['', "BitCount"] + def setUp(self): + super(Test_compare_perf_tests_main, self).setUp() + self.old_log = self.write_temp_file('old.log', self.old_log_content) + self.new_log = self.write_temp_file('new.log', self.new_log_content) + def execute_main_with_format(self, report_format, test_output=False): report_file = self.test_dir + 'report.log' args = ['compare_perf_tests.py', diff --git a/benchmark/scripts/test_utils.py b/benchmark/scripts/test_utils.py new file mode 100644 index 0000000000000..c83d130b73d03 --- /dev/null +++ b/benchmark/scripts/test_utils.py @@ -0,0 +1,108 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ===--- test_utils.py ---------------------------------------------------===// +# +# This source file is part of the Swift.org open source project +# +# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +# Licensed under Apache License v2.0 with Runtime Library Exception +# +# See https://swift.org/LICENSE.txt for license information +# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +# +# ===---------------------------------------------------------------------===// +""" +Homegrown unit testing helpers. + +The issue is that the unittest.mock was added in Python 3.3 and we need to run +on Python 2.7. + +`Stub` and `Mock` are very rudimentary and support only the limited subset of +common unit testing patterns that is used in this project. 
+""" + +import logging +import sys + +from StringIO import StringIO +from contextlib import contextmanager + + +@contextmanager +def captured_output(): + """Capture stdout and stderr and return their output as string buffers.""" + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr + finally: + sys.stdout, sys.stderr = old_out, old_err + + +class Stub: + """Object literal stub for value objects.""" + + def __init__(self, **attributes): + """All named parameters will create properties on this object.""" + self.__dict__.update(attributes) + + +class Mock(object): + """Minimal infrastructure for manually mocking calls to single method. + + Classes inheriting from Mock are expected to create their own mock of the + tested method with appropriate signature, which appends the call arguments + as tuple to the `calls` list and returns the canned response retrieved from + the `respond` dictionary. + """ + + def __init__(self, responses=None): + """Optionaly initialized with a list of expected calls. See expect.""" + self.calls = [] + self.expected = [] + self.respond = dict() + responses = responses or [] + for call_args, response in responses: + self.expect(call_args, response) + + def expect(self, call_args, response): + """Expect invocation of tested method with given arguments. + + Stores the canned reponse in the `respond` dictionary. + """ + call_args = tuple(call_args) + self.expected.append(call_args) + self.respond[call_args] = response + + def assert_called_with(self, expected_args): + """Verify that the tested method was called with provided arguments.""" + expected_args = tuple(expected_args) + assert expected_args in self.calls, ( + 'Expected: {0} in Called: {1}'.format(expected_args, self.calls)) + + def assert_called_all_expected(self): + """Verify that all expeced invocations of tested method were called.""" + assert self.calls == self.expected, ( + '\nExpected: {0}, \n Called: {1}'.format( + self.expected, self.calls)) + + +class MockLoggingHandler(logging.Handler): + """Mock logging handler to check for expected logs.""" + + def __init__(self, *args, **kwargs): + """Prepare the logger for recording messages for each log level.""" + self.reset() + super(MockLoggingHandler, self).__init__(*args, **kwargs) + + def emit(self, record): + """Store the message in list for the given log level.""" + self.messages[record.levelname.lower()].append(record.getMessage()) + + def reset(self): + """Clear all log messages.""" + self.messages = { + 'debug': [], 'info': [], 'warning': [], 'error': [], 'critical': [] + } diff --git a/benchmark/utils/DriverUtils.swift b/benchmark/utils/DriverUtils.swift index 9e9b0e067507a..53f6ea027d058 100644 --- a/benchmark/utils/DriverUtils.swift +++ b/benchmark/utils/DriverUtils.swift @@ -160,6 +160,7 @@ struct TestConfig { LogMemory: \(logMemory) SampleTime: \(sampleTime) FixedIters: \(fixedNumIters) + Delimiter: \(String(reflecting: delim)) Tests Filter: \(c.tests ?? 
[]) Tests to run: \(testList) diff --git a/test/benchmark/Benchmark_Driver.test-sh b/test/benchmark/Benchmark_Driver.test-sh index dec24a369da2b..ad7559c6147da 100644 --- a/test/benchmark/Benchmark_Driver.test-sh +++ b/test/benchmark/Benchmark_Driver.test-sh @@ -4,19 +4,8 @@ // REQUIRES: CMAKE_GENERATOR=Ninja // Integration tests between Benchmark_Driver and Benchmark_O -// TODO: Keep the "run just once" check and move the rest into unit tests for -// Benchmark_Driver, as they are redundant and unnecessarily slow. - -// RUN: %Benchmark_Driver run Ackermann | %FileCheck %s --check-prefix RUNNAMED -// RUNNAMED: Ackermann - -// RUN: %Benchmark_Driver run 1 | %FileCheck %s --check-prefix RUNBYNUMBER -// RUNBYNUMBER: Ackermann // RUN: %Benchmark_Driver run 1 Ackermann 1 \ // RUN: | %FileCheck %s --check-prefix RUNJUSTONCE // RUNJUSTONCE-LABEL: Ackermann // RUNJUSTONCE-NOT: Ackermann - -// RUN: %Benchmark_Driver run -f Acker | %FileCheck %s --check-prefix RUNFILTER -// RUNFILTER: Ackermann diff --git a/test/benchmark/Benchmark_O.test.md b/test/benchmark/Benchmark_O.test.md index 3e07a76aba807..ac8419eb54e94 100644 --- a/test/benchmark/Benchmark_O.test.md +++ b/test/benchmark/Benchmark_O.test.md @@ -143,6 +143,17 @@ LOGMEMORY: ,{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}} LOGVERBOSE-LABEL: Running AngryPhonebook for 2 samples. ```` +Verify the specified delimiter is used when logging to console. The non-verbose +variant of this invocation is used from [`Benchmark_Driver`][BD] to get the list +of all tests. That's why it is *crucial* to tests this integration point. + +```` +RUN: %Benchmark_O --list --skip-tags= --delim=$'\t' --verbose \ +RUN: | %FileCheck %s --check-prefix LOGVERBOSEDELIM +LOGVERBOSEDELIM: Delimiter: "\t" +LOGVERBOSEDELIM: # Test [Tags] +```` + ## Error Handling ````
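
For orientation, here is a minimal sketch of how the tab-delimited `--list`
output exercised above (a `# Test [Tags]` header followed by numbered rows)
can be turned into test names and tags. The function name and structure are
illustrative only and are not taken from the actual `BenchmarkDriver`
implementation.

```python
def parse_benchmark_listing(listing):
    r"""Parse tab-delimited `Benchmark_O --list` output into (name, tags) pairs.

    Expects a header row `#\tTest\t[Tags]` followed by rows such as
    `1\tBenchmark1\t[t1, t2]`.
    """
    tests = []
    for line in listing.splitlines()[1:]:  # skip the header row
        if not line.strip():
            continue
        _, name, tags = line.split('\t')
        tags = tags.strip('[]')
        tests.append(
            (name, [t.strip() for t in tags.split(',')] if tags else []))
    return tests


if __name__ == '__main__':
    listing = ('#\tTest\t[Tags]\n'
               '1\tBenchmark1\t[t1, t2]\n'
               '2\tBenchmark2\t[t3]\n')
    assert parse_benchmark_listing(listing) == [
        ('Benchmark1', ['t1', 't2']), ('Benchmark2', ['t3'])]
```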