From 49ddd96c83e567a88857d06a9b3d6a4b69e54fcf Mon Sep 17 00:00:00 2001
From: Pavol Vaskovic
Date: Thu, 1 Jun 2017 19:39:38 +0200
Subject: [PATCH 1/6] Added documentation and test coverage.

compare_perf_tests.py is now covered with unit tests and public methods
are documented in the implementation.

Minor refactoring to better conform to Python conventions:
* classes declared in new style
* proper private method prefix of single underscore
* replacing map with list comprehension where it was clearer

Unit tests are executed as part of validation-test.

.gitignore was modified to ignore .coverage and htmlcov artifacts
generated by the coverage.py package.
---
 .gitignore                                    |   2 +
 benchmark/scripts/compare_perf_tests.py       | 150 ++++++++----
 benchmark/scripts/test_compare_perf_tests.py  | 229 ++++++++++++++++++
 .../Python/benchmark-scripts.test-sh          |   2 +
 4 files changed, 336 insertions(+), 47 deletions(-)
 create mode 100644 benchmark/scripts/test_compare_perf_tests.py
 create mode 100644 validation-test/Python/benchmark-scripts.test-sh

diff --git a/.gitignore b/.gitignore
index 6b9ce48e3ab00..74aa31bc37df7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,5 @@ compile_commands.json
 8
 4
 SortedCFDatabase.def
+htmlcov
+.coverage
diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py
index 8f636364c586c..7d51c68a90a42 100755
--- a/benchmark/scripts/compare_perf_tests.py
+++ b/benchmark/scripts/compare_perf_tests.py
@@ -21,8 +21,22 @@
 from math import sqrt
 
 
-class PerformanceTestResult:
+class PerformanceTestResult(object):
+    """PerformanceTestResult holds results from executing individual benchmark
+    from Swift Benchmark Suite as reported by test driver (Benchmark_O,
+    Benchmark_Onone, Benchmark_Ounchecked or Benchmark_Driver).
+
+    It depends on the log format emmited by the test driver in the form:
+    #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
+
+    The last column, MAX_RSS, is emmited only for runs instrumented by the
+    Benchmark_Driver to measure rough memory use during the execution of the
+    benchmark.
+    """
     def __init__(self, csv_row):
+        """PerformanceTestResult instance is created from an iterable with
+        length of 8 or 9. (Like a row provided by the CSV parser.)
+        """
         # csv_row[0] is just an ordinal number of the test - skip that
         self.name = csv_row[1]          # Name of the performance test
         self.samples = int(csv_row[2])  # Number of measurement samples taken
@@ -36,25 +50,41 @@ def __init__(self, csv_row):
         self.median = int(csv_row[7])   # Median runtime (ms)
         self.max_rss = (                # Maximum Resident Set Size (B)
             int(csv_row[8]) if len(csv_row) > 8 else None)
-        # TODO if we really want to compute mean MAX_RSS: self.S_memory
+
+    def __repr__(self):
+        return (
+            ''.format(self))
 
     @property
-    def sd(self):  # Standard Deviation (ms)
+    def sd(self):
+        """Standard Deviation (ms)"""
         return (0 if self.samples < 2 else
                 sqrt(self.S_runtime / (self.samples - 1)))
 
-    # Compute running variance, B. P. Welford's method
-    # See Knuth TAOCP vol 2, 3rd edition, page 232, or
-    # https://www.johndcook.com/blog/standard_deviation/
-    # M is mean, Standard Deviation is defined as sqrt(S/k-1)
     @staticmethod
     def running_mean_variance((k, M_, S_), x):
+        """
+        Compute running variance, B. P. Welford's method
+        See Knuth TAOCP vol 2, 3rd edition, page 232, or
+        https://www.johndcook.com/blog/standard_deviation/
+        M is mean, Standard Deviation is defined as sqrt(S/k-1)
+        """
         k = float(k + 1)
         M = M_ + (x - M_) / k
         S = S_ + (x - M_) * (x - M)
         return (k, M, S)
 
     def merge(self, r):
+        """Merging test results recomputes min and max.
+        It attempts to recompute mean and standard deviation when all_samples
+        are available. There is no correct way to compute these values from
+        test results that are summaries from more than 3 samples.
+
+        The use case here is comparing tests results parsed from concatenated
+        log files from multiple runs of benchmark driver.
+        """
         self.min = min(self.min, r.min)
         self.max = max(self.max, r.max)
         # self.median = None # unclear what to do here
@@ -65,23 +95,31 @@ def push(x):
             (self.samples, self.mean, self.S_runtime) = state
 
         # Merging test results with up to 3 samples is exact
-        # TODO investigate how to best handle merge of higher sample counts
-        values = [r.min, r.max, r.median, r.mean][:min(r.samples, 4)]
+        values = [r.min, r.max, r.median][:min(r.samples, 3)]
         map(push, values)
 
+    # Column labels for header row in results table
     header = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS')
 
-    # Tuple of values formatted for display in results table:
-    # (name, min value, max value, mean value, max_rss)
     def values(self):
-        return (self.name, str(self.min), str(self.max), str(int(self.mean)),
-                str(self.max_rss) if self.max_rss else '-')
-
-
-class ResultComparison:
+        """Values property for display in results table comparisons
+        in format: ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS').
+        """
+        return (
+            self.name,
+            str(self.min), str(self.max), str(int(self.mean)),
+            str(self.max_rss) if self.max_rss else '—'
+        )
+
+
+class ResultComparison(object):
+    """ResultComparison compares MINs from new and old PerformanceTestResult.
+    It computes speedup ratio and improvement delta (%).
+    """
     def __init__(self, old, new):
         self.old = old
         self.new = new
+        assert(old.name == new.name)
         self.name = old.name  # Test name, convenience accessor
 
         # Speedup ratio
@@ -91,27 +129,43 @@ def __init__(self, old, new):
         ratio = (new.min + 0.001) / (old.min + 0.001)
         self.delta = ((ratio - 1) * 100)
 
-        self.is_dubious = (  # FIXME this is legacy
+        # Add ' (?)' to the speedup column as indication of dubious changes:
+        # result's MIN falls inside the (MIN, MAX) interval of result they are
+        # being compared with.
+        self.is_dubious = (
             ' (?)' if ((old.min < new.min and new.min < old.max) or
                        (new.min < old.min and old.min < new.max))
             else '')
 
+    # Column labels for header row in results table
     header = ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP')
 
-    # Tuple of values formatted for display in results table:
-    # (name, old value, new value, delta [%], speedup ratio)
     def values(self):
-        return (self.name, str(self.old.min), str(self.new.min),
+        """Values property for display in results table comparisons
+        in format: ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP').
+        """
+        return (self.name,
+                str(self.old.min), str(self.new.min),
                 '{0:+.1f}%'.format(self.delta),
                 '{0:.2f}x{1}'.format(self.ratio, self.is_dubious))
 
 
-class TestComparator:
-    def __init__(self, old_file, new_file, delta_threshold, changes_only):
+class TestComparator(object):
+    """TestComparator parses `PerformanceTestResult`s from CSV log files.
+    Then it determines which tests were `added`, `removed` and which can be
+    compared.
It then splits the `ResultComparison`s into 3 groups according to + the `delta_threshold` by the change in performance: `increased`, + `descreased` and `unchanged`. + + The lists of `added`, `removed` and `unchanged` tests are sorted + alphabetically. The `increased` and `decreased` lists are sorted in + descending order by the amount of change. + """ + def __init__(self, old_file, new_file, delta_threshold): def load_from_CSV(filename): # handles output from Benchmark_O and def skip_totals(row): # Benchmark_Driver (added MAX_RSS column) - return len(row) > 7 and row[0].isdigit() + return len(row) > 7 and row[0].isdigit() tests = map(PerformanceTestResult, filter(skip_totals, csv.reader(open(filename)))) @@ -131,9 +185,9 @@ def add_or_merge(names, r): added_tests = new_tests.difference(old_tests) removed_tests = old_tests.difference(new_tests) - self.added = sorted(map(lambda t: new_results[t], added_tests), + self.added = sorted([new_results[t] for t in added_tests], key=lambda r: r.name) - self.removed = sorted(map(lambda t: old_results[t], removed_tests), + self.removed = sorted([old_results[t] for t in removed_tests], key=lambda r: r.name) def compare(name): @@ -144,24 +198,28 @@ def compare(name): def partition(l, p): return reduce(lambda x, y: x[not p(y)].append(y) or x, l, ([], [])) - # TODO take standard deviation (SD) into account decreased, not_decreased = partition( comparisons, lambda c: c.ratio < (1 - delta_threshold)) increased, unchanged = partition( not_decreased, lambda c: c.ratio > (1 + delta_threshold)) # sorted partitions - names = map(lambda c: c.name, comparisons) + names = [c.name for c in comparisons] comparisons = dict(zip(names, comparisons)) - self.decreased = map(lambda c: comparisons[c.name], - sorted(decreased, key=lambda c: -c.delta)) - self.increased = map(lambda c: comparisons[c.name], - sorted(increased, key=lambda c: c.delta)) - self.unchanged = map(lambda c: comparisons[c.name], - sorted(unchanged, key=lambda c: c.name)) - - -class ReportFormatter: + self.decreased = [comparisons[c.name] + for c in sorted(decreased, key=lambda c: -c.delta)] + self.increased = [comparisons[c.name] + for c in sorted(increased, key=lambda c: c.delta)] + self.unchanged = [comparisons[c.name] + for c in sorted(unchanged, key=lambda c: c.name)] + + +class ReportFormatter(object): + """ReportFormatter formats the `PerformanceTestResult`s and + `ResultComparison`s provided by `TestComparator` using their `header` and + `values()` into report table. Supported formats are: `markdown` (used for + displaying benchmark results on GitHub), `git` (default) and `html`. 
+ """ def __init__(self, comparator, old_branch, new_branch, changes_only): self.comparator = comparator self.old_branch = old_branch @@ -178,35 +236,34 @@ def __init__(self, comparator, old_branch, new_branch, changes_only): {0} ({1}): {2}""" def markdown(self): - return self.__formatted_text( + return self._formatted_text( ROW='{0} | {1} | {2} | {3} | {4} \n', HEADER_SEPARATOR='---', DETAIL=self.MARKDOWN_DETAIL) def git(self): - return self.__formatted_text( + return self._formatted_text( ROW='{0} {1} {2} {3} {4} \n', HEADER_SEPARATOR=' ', DETAIL=self.GIT_DETAIL) - def __column_widths(self): + def _column_widths(self): changed = self.comparator.decreased + self.comparator.increased comparisons = (changed if self.changes_only else changed + self.comparator.unchanged) comparisons += self.comparator.added + self.comparator.removed - values = map(lambda c: c.values(), comparisons) widths = map(lambda columns: map(len, columns), [PerformanceTestResult.header, ResultComparison.header] + - values) + [c.values() for c in comparisons]) def max_widths(maximum, widths): return tuple(map(max, zip(maximum, widths))) return reduce(max_widths, widths, tuple([0] * 5)) - def __formatted_text(self, ROW, HEADER_SEPARATOR, DETAIL): - widths = self.__column_widths() + def _formatted_text(self, ROW, HEADER_SEPARATOR, DETAIL): + widths = self._column_widths() def justify_columns(contents): return tuple(map(lambda (w, c): c.ljust(w), zip(widths, contents))) @@ -343,7 +400,7 @@ def main(): args = parser.parse_args() comparator = TestComparator(args.old_file, args.new_file, - float(args.delta_threshold), args.changes_only) + float(args.delta_threshold)) formatter = ReportFormatter(comparator, args.old_branch, args.new_branch, args.changes_only) @@ -372,9 +429,8 @@ def write_to_file(file_name, data): """ Write data to given file """ - file = open(file_name, 'w') - file.write(data) - file.close + with open(file_name, 'w') as f: + f.write(data) if __name__ == '__main__': diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py new file mode 100644 index 0000000000000..663ad04d01b26 --- /dev/null +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -0,0 +1,229 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ===--- benchmark_utils.py ----------------------------------------------===// +# +# This source file is part of the Swift.org open source project +# +# Copyright (c) 2014 - 2017 Apple Inc. 
and the Swift project authors +# Licensed under Apache License v2.0 with Runtime Library Exception +# +# See https://swift.org/LICENSE.txt for license information +# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +# +# ===---------------------------------------------------------------------===// + +import os +import shutil +import tempfile +import unittest + +from compare_perf_tests import PerformanceTestResult +from compare_perf_tests import ResultComparison +from compare_perf_tests import TestComparator + + +class TestPerformanceTestResult(unittest.TestCase): + + def test_init(self): + log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' + r = PerformanceTestResult(log_line.split(',')) + self.assertEquals(r.name, 'AngryPhonebook') + self.assertEquals((r.samples, r.min, r.max, r.mean, r.sd, r.median), + (20, 10664, 12933, 11035, 576, 10884)) + + log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' + r = PerformanceTestResult(log_line.split(',')) + self.assertEquals(r.max_rss, 10510336) + + def test_repr(self): + log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' + r = PerformanceTestResult(log_line.split(',')) + self.assertEquals( + str(r), + '' + ) + + def test_header(self): + self.assertEquals(PerformanceTestResult.header, + ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS')) + + def test_values(self): + log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' + r = PerformanceTestResult(log_line.split(',')) + self.assertEquals( + r.values(), + ('AngryPhonebook', '10664', '12933', '11035', '—') + ) + log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' + r = PerformanceTestResult(log_line.split(',')) + self.assertEquals( + r.values(), + ('AngryPhonebook', '12045', '12045', '12045', '10510336') + ) + + def test_merge(self): + tests = """1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336 +1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336 +1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144 +1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n') + results = map(PerformanceTestResult, + [line.split(',') for line in tests]) + + def as_tuple(r): + return (r.min, r.max, round(r.mean, 2), round(r.sd, 2), r.median, + r.max_rss) + + r = results[0] + self.assertEquals(as_tuple(r), + (12045, 12045, 12045, 0, 12045, 10510336)) + r.merge(results[1]) + self.assertEquals(as_tuple(r), + (12045, 12325, 12185, 197.99, 12045, 10510336)) + r.merge(results[2]) + self.assertEquals(as_tuple(r), + (11616, 12325, 11995.33, 357.10, 12045, 10510336)) + r.merge(results[3]) + self.assertEquals(as_tuple(r), + (11616, 12325, 12064, 322.29, 12045, 10510336)) + + +class TestResultComparison(unittest.TestCase): + def setUp(self): + self.r0 = PerformanceTestResult( + '101,GlobalClass,20,0,0,0,0,0,10185728'.split(',')) + self.r01 = PerformanceTestResult( + '101,GlobalClass,20,20,20,20,0,0,10185728'.split(',')) + self.r1 = PerformanceTestResult( + '1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336'.split(',')) + self.r2 = PerformanceTestResult( + '1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144'.split(',')) + + def test_init(self): + rc = ResultComparison(self.r1, self.r2) + self.assertEquals(rc.name, 'AngryPhonebook') + self.assertAlmostEquals(rc.ratio, 12325.0 / 11616.0) + self.assertAlmostEquals(rc.delta, (((11616.0 / 12325.0) - 1) * 100), + places=3) + # handle test results that sometimes change to zero, when compiler + # optimizes out the body of the incorrectly written test + rc = ResultComparison(self.r0, self.r0) 
+ self.assertEquals(rc.name, 'GlobalClass') + self.assertAlmostEquals(rc.ratio, 1) + self.assertAlmostEquals(rc.delta, 0, places=3) + rc = ResultComparison(self.r0, self.r01) + self.assertAlmostEquals(rc.ratio, 0, places=3) + self.assertAlmostEquals(rc.delta, 2000000, places=3) + rc = ResultComparison(self.r01, self.r0) + self.assertAlmostEquals(rc.ratio, 20001) + self.assertAlmostEquals(rc.delta, -99.995, places=3) + # disallow comparison of different test results + self.assertRaises( + AssertionError, + ResultComparison, self.r0, self.r1 + ) + + def test_header(self): + self.assertEquals(ResultComparison.header, + ('TEST', 'OLD', 'NEW', 'DELTA', 'SPEEDUP')) + + def test_values(self): + rc = ResultComparison(self.r1, self.r2) + self.assertEquals( + rc.values(), + ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') + ) + # other way around + rc = ResultComparison(self.r2, self.r1) + self.assertEquals( + rc.values(), + ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') + ) + + def test_values_is_dubious(self): + self.r2.max = self.r1.min + 1 + # new.min < old.min < new.max + rc = ResultComparison(self.r1, self.r2) + self.assertEquals(rc.values()[4], '1.06x (?)') + # other way around: old.min < new.min < old.max + rc = ResultComparison(self.r2, self.r1) + self.assertEquals(rc.values()[4], '0.94x (?)') + + +class OldAndNewLog(unittest.TestCase): + def setUp(self): + # Create a temporary directory + self.test_dir = tempfile.mkdtemp() + + self.old_log = self.write_temp_file('old.log', """ +1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365 +2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445 +3,Array2D,20,335831,400221,346622,0,346622,28297216 +4,ArrayAppend,20,23641,29000,24990,0,24990,11149926 +34,BitCount,20,3,4,4,0,4,10192896 +35,ByteSwap,20,4,6,4,0,4,10185933 + +Totals,269,67351871,70727022,68220188,0,0,0 +""") + # Log lines are deliberately in descending sort order in order to test + # the ordering of the comparison results. + # ArrayAppend is included twice to test tesults merging. 
+ self.new_log = self.write_temp_file('new.log', """ +265,TwoSum,20,5006,5679,5111,0,5111 +35,ByteSwap,20,0,0,0,0,0 +34,BitCount,20,9,9,9,0,9 +4,ArrayAppend,20,23641,29000,24990,0,24990 +3,Array2D,20,335831,400221,346622,0,346622 +1,AngryPhonebook,20,10458,12714,11000,0,11000 + +Totals,269,67351871,70727022,68220188,0,0,0 +4,ArrayAppend,1,20000,20000,20000,0,20000 + +Totals,1,20000,20000,20000,0,0,0 +""") + + def tearDown(self): + # Remove the directory after the test + shutil.rmtree(self.test_dir) + + def write_temp_file(self, file_name, data): + temp_file_name = os.path.join(self.test_dir, file_name) + with open(temp_file_name, 'w') as f: + f.write(data) + return temp_file_name + + +class TestTestComparator(OldAndNewLog): + def test_init(self): + old_log, new_log = self.old_log, self.new_log + + def names(tests): + return [t.name for t in tests] + + tc = TestComparator(old_log, new_log, 0.05) + self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) + self.assertEquals(names(tc.increased), ['ByteSwap', 'ArrayAppend']) + self.assertEquals(names(tc.decreased), ['BitCount']) + self.assertEquals(names(tc.added), ['TwoSum']) + self.assertEquals(names(tc.removed), ['AnyHashableWithAClass']) + # other way around + tc = TestComparator(new_log, old_log, 0.05) + self.assertEquals(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) + self.assertEquals(names(tc.increased), ['BitCount']) + self.assertEquals(names(tc.decreased), ['ByteSwap', 'ArrayAppend']) + self.assertEquals(names(tc.added), ['AnyHashableWithAClass']) + self.assertEquals(names(tc.removed), ['TwoSum']) + # delta_threshold determines the sorting into change groups; + # report only change above 100% (ByteSwap's runtime went to 0): + tc = TestComparator(old_log, new_log, 1) + self.assertEquals( + names(tc.unchanged), + ['AngryPhonebook', 'Array2D', 'ArrayAppend', 'BitCount'] + ) + self.assertEquals(names(tc.increased), ['ByteSwap']) + self.assertEquals(tc.decreased, []) + + +if __name__ == '__main__': + unittest.main() diff --git a/validation-test/Python/benchmark-scripts.test-sh b/validation-test/Python/benchmark-scripts.test-sh new file mode 100644 index 0000000000000..16715ac4f3311 --- /dev/null +++ b/validation-test/Python/benchmark-scripts.test-sh @@ -0,0 +1,2 @@ +// RUN: %{python} -m unittest discover -s %swift_src_root/benchmark/scripts/ +// REQUIRES: OS=macosx From d178b6e0cdb4476830a152f7ab7e44acc114d345 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 1 Jun 2017 22:19:33 +0200 Subject: [PATCH 2/6] Improved coverage with more tests: parse_args Coverage at 66% according to coveragy.py --- benchmark/scripts/compare_perf_tests.py | 22 +++++--- benchmark/scripts/test_compare_perf_tests.py | 53 ++++++++++++++++++++ 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 7d51c68a90a42..9e68800f1a9d4 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -253,9 +253,11 @@ def _column_widths(self): changed + self.comparator.unchanged) comparisons += self.comparator.added + self.comparator.removed - widths = map(lambda columns: map(len, columns), - [PerformanceTestResult.header, ResultComparison.header] + - [c.values() for c in comparisons]) + widths = [ + map(len, columns) for columns in + [PerformanceTestResult.header, ResultComparison.header] + + [c.values() for c in comparisons] + ] def max_widths(maximum, widths): return tuple(map(max, zip(maximum, widths))) @@ -375,8 
+377,8 @@ def table(title, results, speedup_color): ])) -def main(): - +def parse_args(args): + """Parse command line arguments and set default values.""" parser = argparse.ArgumentParser(description='Compare Performance tests.') parser.add_argument('--old-file', help='Baseline performance test suite (csv file)', @@ -396,11 +398,15 @@ def main(): parser.add_argument('--old-branch', help='Name of the old branch', default='OLD_MIN') parser.add_argument('--delta-threshold', - help='Delta threshold. Default 0.05.', default='0.05') + help='Delta threshold. Default 0.05.', + type=float, default=0.05) + return parser.parse_args(args) - args = parser.parse_args() + +def main(): + args = parse_args(sys.argv[1:]) comparator = TestComparator(args.old_file, args.new_file, - float(args.delta_threshold)) + args.delta_threshold) formatter = ReportFormatter(comparator, args.old_branch, args.new_branch, args.changes_only) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 663ad04d01b26..09078d76f4c58 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -21,6 +21,7 @@ from compare_perf_tests import PerformanceTestResult from compare_perf_tests import ResultComparison from compare_perf_tests import TestComparator +from compare_perf_tests import parse_args class TestPerformanceTestResult(unittest.TestCase): @@ -225,5 +226,57 @@ def names(tests): self.assertEquals(tc.decreased, []) +class Test_parse_args(unittest.TestCase): + required = ['--old-file', 'old.log', '--new-file', 'new.log'] + + def test_required_input_arguments(self): + args = parse_args(self.required) + self.assertEquals(args.old_file, 'old.log') + self.assertEquals(args.new_file, 'new.log') + self.assertRaises(SystemExit, parse_args, []) + + def test_format_argument(self): + self.assertEquals( + parse_args(self.required + ['--format', 'markdown']).format, + 'markdown') + self.assertEquals( + parse_args(self.required + ['--format', 'git']).format, 'git') + self.assertEquals( + parse_args(self.required + ['--format', 'html']).format, 'html') + self.assertRaises(SystemExit, parse_args, + self.required + ['--format', 'bogus']) + + def test_delta_threshold_argument(self): + # default value + args = parse_args(self.required) + self.assertEquals(args.delta_threshold, 0.05) + # float parsing + args = parse_args(self.required + ['--delta-threshold', '0.1']) + self.assertEquals(args.delta_threshold, 0.1) + args = parse_args(self.required + ['--delta-threshold', '1']) + self.assertEquals(args.delta_threshold, 1.0) + args = parse_args(self.required + ['--delta-threshold', '.2']) + self.assertEquals(args.delta_threshold, 0.2) + self.assertRaises(SystemExit, parse_args, + self.required + ['--delta-threshold', '2,2']) + + def test_changes_only_argument(self): + self.assertFalse(parse_args(self.required).changes_only) + self.assertTrue(parse_args(self.required + + ['--changes-only']).changes_only) + + def test_branch_arguments(self): + # default value + args = parse_args(self.required) + self.assertEquals(args.new_branch, 'NEW_MIN') + self.assertEquals(args.old_branch, 'OLD_MIN') + # user specified + args = parse_args( + self.required + ['--old-branch', 'master', + '--new-branch', 'amazing-optimization']) + self.assertEquals(args.old_branch, 'master') + self.assertEquals(args.new_branch, 'amazing-optimization') + + if __name__ == '__main__': unittest.main() From 9265a71ac6407fd683a086e0737b2e74a901d839 Mon Sep 17 00:00:00 2001 From: Pavol 
Vaskovic Date: Fri, 2 Jun 2017 02:22:21 +0200 Subject: [PATCH 3/6] Improved coverage: ReportFormatter Coverage at 87% according to coveragy.py Also fixed spelling errors in documentation. --- benchmark/scripts/compare_perf_tests.py | 10 +- benchmark/scripts/test_compare_perf_tests.py | 150 +++++++++++++++++++ 2 files changed, 155 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 9e68800f1a9d4..74c3a7a134947 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -22,14 +22,14 @@ class PerformanceTestResult(object): - """PerformanceTestResult holds results from executing individual benchmark - from Swift Benchmark Suite as reported by test driver (Benchmark_O, - Benchmark_Onone, Benchmark_Ounchecked or Benchmark_Driver). + """PerformanceTestResult holds results from executing an individual + benchmark from the Swift Benchmark Suite as reported by the test driver + (Benchmark_O, Benchmark_Onone, Benchmark_Ounchecked or Benchmark_Driver). - It depends on the log format emmited by the test driver in the form: + It depends on the log format emitted by the test driver in the form: #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) - The last column, MAX_RSS, is emmited only for runs instrumented by the + The last column, MAX_RSS, is emitted only for runs instrumented by the Benchmark_Driver to measure rough memory use during the execution of the benchmark. """ diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 09078d76f4c58..1d319b4c1f403 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -19,6 +19,7 @@ import unittest from compare_perf_tests import PerformanceTestResult +from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison from compare_perf_tests import TestComparator from compare_perf_tests import parse_args @@ -226,6 +227,149 @@ def names(tests): self.assertEquals(tc.decreased, []) +class TestReportFormatter(OldAndNewLog): + def setUp(self): + super(TestReportFormatter, self).setUp() + self.tc = TestComparator(self.old_log, self.new_log, 0.05) + self.rf = ReportFormatter(self.tc, '', '', changes_only=False) + self.markdown = self.rf.markdown() + self.git = self.rf.git() + self.html = self.rf.html() + + def assert_report_contains(self, texts, report): + # assert not isinstance(texts, str) + if isinstance(texts, str): + self.assertIn(texts, report) + else: + for text in texts: + self.assertIn(text, report) + + def assert_markdown_contains(self, texts): + self.assert_report_contains(texts, self.markdown) + + def assert_git_contains(self, texts): + self.assert_report_contains(texts, self.git) + + def assert_html_contains(self, texts): + self.assert_report_contains(texts, self.html) + + def test_justified_columns(self): + """Table columns are all formated with same width, defined by the + longest value. + """ + print self.markdown + self.assert_markdown_contains([ + 'AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445', + 'Array2D | 335831 | 335831 | +0.0% | 1.00x']) + self.assert_git_contains([ + 'AnyHashableWithAClass 247027 319065 259056 10250445', + 'Array2D 335831 335831 +0.0% 1.00x']) + + def test_column_headers(self): + """Report contains table headers for ResultComparisons and changed + PerformanceTestResults. 
+ """ + print self.git + self.assert_markdown_contains([ + 'TEST | OLD | NEW | DELTA | SPEEDUP', + '--- | --- | --- | --- | --- ', + 'TEST | MIN | MAX | MEAN | MAX_RSS']) + self.assert_git_contains([ + 'TEST OLD NEW DELTA SPEEDUP', + 'TEST MIN MAX MEAN MAX_RSS']) + self.assert_html_contains([ + """ + OLD + NEW + DELTA + SPEEDUP""", + """ + MIN + MAX + MEAN + MAX_RSS"""]) + + def test_emphasize_speedup(self): + """Emphasize speedup values for regressions and improvements""" + # tests in No Changes don't have emphasized speedup + self.assert_markdown_contains([ + 'BitCount | 3 | 9 | +199.9% | **0.33x**', + 'ByteSwap | 4 | 0 | -100.0% | **4001.00x**', + 'AngryPhonebook | 10458 | 10458 | +0.0% | 1.00x ', + 'ArrayAppend | 23641 | 20000 | -15.4% | **1.18x (?)**' + ]) + self.assert_git_contains([ + 'BitCount 3 9 +199.9% **0.33x**', + 'ByteSwap 4 0 -100.0% **4001.00x**', + 'AngryPhonebook 10458 10458 +0.0% 1.00x', + 'ArrayAppend 23641 20000 -15.4% **1.18x (?)**' + ]) + print self.html + self.assert_html_contains([ + """ + + BitCount + 3 + 9 + +199.9% + 0.33x + """, + """ + + ByteSwap + 4 + 0 + -100.0% + 4001.00x + """, + """ + + AngryPhonebook + 10458 + 10458 + +0.0% + 1.00x + """ + ]) + + def test_sections(self): + """Report is divided into sections with summaries.""" + self.assert_markdown_contains([ + """
+ Regression (1)""", + """
+ Improvement (2)""", + """
+ No Changes (2)""", + """
+ Added (1)""", + """
+ Removed (1)"""]) + self.assert_git_contains([ + 'Regression (1): \n', + 'Improvement (2): \n', + 'No Changes (2): \n', + 'Added (1): \n', + 'Removed (1): \n']) + self.assert_html_contains([ + "Regression (1)", + "Improvement (2)", + "No Changes (2)", + "Added (1)", + "Removed (1)"]) + + def test_report_only_changes(self): + """Leave out tests without significant change.""" + rf = ReportFormatter(self.tc, '', '', changes_only=True) + markdown, git, html = rf.markdown(), rf.git(), rf.html() + self.assertNotIn('No Changes', markdown) + self.assertNotIn('AngryPhonebook', markdown) + self.assertNotIn('No Changes', git) + self.assertNotIn('AngryPhonebook', git) + self.assertNotIn('No Changes', html) + self.assertNotIn('AngryPhonebook', html) + + class Test_parse_args(unittest.TestCase): required = ['--old-file', 'old.log', '--new-file', 'new.log'] @@ -260,6 +404,12 @@ def test_delta_threshold_argument(self): self.assertRaises(SystemExit, parse_args, self.required + ['--delta-threshold', '2,2']) + def test_output_argument(self): + self.assertEquals(parse_args(self.required).output, None) + self.assertEquals(parse_args(self.required + + ['--output', 'report.log']).output, + 'report.log') + def test_changes_only_argument(self): self.assertFalse(parse_args(self.required).changes_only) self.assertTrue(parse_args(self.required + From dea7d8fe775cd13208fe42e719f944d87061529c Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sun, 4 Jun 2017 18:31:06 +0200 Subject: [PATCH 4/6] Consistent --output; Improved coverage: main() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coverage at 99% according to coverage.py * `compare_perf_tests.py` now always outputs the same format to stdout as is written to `--output` file * Added integration test for the main() function * Added tests for console output (and suppressed it leaking during testing) * Fixed file name in test’s file header --- benchmark/scripts/compare_perf_tests.py | 42 ++---- benchmark/scripts/test_compare_perf_tests.py | 133 ++++++++++++++++--- 2 files changed, 129 insertions(+), 46 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 74c3a7a134947..dafe841d5287b 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -268,7 +268,7 @@ def _formatted_text(self, ROW, HEADER_SEPARATOR, DETAIL): widths = self._column_widths() def justify_columns(contents): - return tuple(map(lambda (w, c): c.ljust(w), zip(widths, contents))) + return tuple([c.ljust(w) for w, c in zip(widths, contents)]) def row(contents): return ROW.format(*justify_columns(contents)) @@ -409,34 +409,18 @@ def main(): args.delta_threshold) formatter = ReportFormatter(comparator, args.old_branch, args.new_branch, args.changes_only) - - if args.format: - if args.format.lower() != 'markdown': - print(formatter.git()) - else: - print(formatter.markdown()) - - if args.format: - if args.format.lower() == 'html': - if args.output: - write_to_file(args.output, formatter.html()) - else: - print('Error: missing --output flag.') - sys.exit(1) - elif args.format.lower() == 'markdown': - if args.output: - write_to_file(args.output, formatter.markdown()) - elif args.format.lower() != 'git': - print('{0} is unknown format.'.format(args.format)) - sys.exit(1) - - -def write_to_file(file_name, data): - """ - Write data to given file - """ - with open(file_name, 'w') as f: - f.write(data) + formats = { + 'markdown': formatter.markdown, + 'git': 
formatter.git, + 'html': formatter.html + } + + report = formats[args.format]() + print(report) + + if args.output: + with open(args.output, 'w') as f: + f.write(report) if __name__ == '__main__': diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 1d319b4c1f403..46b3d3400ef20 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# ===--- benchmark_utils.py ----------------------------------------------===// +# ===--- test_compare_perf_tests.py --------------------------------------===// # # This source file is part of the Swift.org open source project # @@ -15,16 +15,32 @@ import os import shutil +import sys import tempfile import unittest +from StringIO import StringIO +from contextlib import contextmanager + from compare_perf_tests import PerformanceTestResult from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison from compare_perf_tests import TestComparator +from compare_perf_tests import main from compare_perf_tests import parse_args +@contextmanager +def captured_output(): + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr + finally: + sys.stdout, sys.stderr = old_out, old_err + + class TestPerformanceTestResult(unittest.TestCase): def test_init(self): @@ -195,6 +211,11 @@ def write_temp_file(self, file_name, data): f.write(data) return temp_file_name + def assert_report_contains(self, texts, report): + assert not isinstance(texts, str) + for text in texts: + self.assertIn(text, report) + class TestTestComparator(OldAndNewLog): def test_init(self): @@ -236,14 +257,6 @@ def setUp(self): self.git = self.rf.git() self.html = self.rf.html() - def assert_report_contains(self, texts, report): - # assert not isinstance(texts, str) - if isinstance(texts, str): - self.assertIn(texts, report) - else: - for text in texts: - self.assertIn(text, report) - def assert_markdown_contains(self, texts): self.assert_report_contains(texts, self.markdown) @@ -257,7 +270,6 @@ def test_justified_columns(self): """Table columns are all formated with same width, defined by the longest value. """ - print self.markdown self.assert_markdown_contains([ 'AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445', 'Array2D | 335831 | 335831 | +0.0% | 1.00x']) @@ -269,7 +281,6 @@ def test_column_headers(self): """Report contains table headers for ResultComparisons and changed PerformanceTestResults. 
""" - print self.git self.assert_markdown_contains([ 'TEST | OLD | NEW | DELTA | SPEEDUP', '--- | --- | --- | --- | --- ', @@ -304,7 +315,6 @@ def test_emphasize_speedup(self): 'AngryPhonebook 10458 10458 +0.0% 1.00x', 'ArrayAppend 23641 20000 -15.4% **1.18x (?)**' ]) - print self.html self.assert_html_contains([ """ @@ -374,12 +384,16 @@ class Test_parse_args(unittest.TestCase): required = ['--old-file', 'old.log', '--new-file', 'new.log'] def test_required_input_arguments(self): + with captured_output() as (_, err): + self.assertRaises(SystemExit, parse_args, []) + self.assertIn('usage: compare_perf_tests.py', err.getvalue()) + args = parse_args(self.required) self.assertEquals(args.old_file, 'old.log') self.assertEquals(args.new_file, 'new.log') - self.assertRaises(SystemExit, parse_args, []) def test_format_argument(self): + self.assertEquals(parse_args(self.required).format, 'markdown') self.assertEquals( parse_args(self.required + ['--format', 'markdown']).format, 'markdown') @@ -387,8 +401,13 @@ def test_format_argument(self): parse_args(self.required + ['--format', 'git']).format, 'git') self.assertEquals( parse_args(self.required + ['--format', 'html']).format, 'html') - self.assertRaises(SystemExit, parse_args, - self.required + ['--format', 'bogus']) + + with captured_output() as (_, err): + self.assertRaises(SystemExit, parse_args, + self.required + ['--format', 'bogus']) + self.assertIn("error: argument --format: invalid choice: 'bogus' " + "(choose from 'markdown', 'git', 'html')", + err.getvalue()) def test_delta_threshold_argument(self): # default value @@ -401,8 +420,13 @@ def test_delta_threshold_argument(self): self.assertEquals(args.delta_threshold, 1.0) args = parse_args(self.required + ['--delta-threshold', '.2']) self.assertEquals(args.delta_threshold, 0.2) - self.assertRaises(SystemExit, parse_args, - self.required + ['--delta-threshold', '2,2']) + + with captured_output() as (_, err): + self.assertRaises(SystemExit, parse_args, + self.required + ['--delta-threshold', '2,2']) + self.assertIn(" error: argument --delta-threshold: invalid float " + "value: '2,2'", + err.getvalue()) def test_output_argument(self): self.assertEquals(parse_args(self.required).output, None) @@ -428,5 +452,80 @@ def test_branch_arguments(self): self.assertEquals(args.new_branch, 'amazing-optimization') +class Test_compare_perf_tests_main(OldAndNewLog): + """Integration test that invokes the whole comparison script.""" + markdown = [ + 'Regression (1)', + 'TEST | OLD | NEW | DELTA | SPEEDUP', + 'BitCount | 3 | 9 | +199.9% | **0.33x**', + ] + git = [ + 'Regression (1):', + 'TEST OLD NEW DELTA SPEEDUP', + 'BitCount 3 9 +199.9% **0.33x**', + ] + html = ['', "BitCount"] + + def execute_main_with_format(self, report_format, test_output=False): + report_file = self.test_dir + 'report.log' + args = ['compare_perf_tests.py', + '--old-file', self.old_log, + '--new-file', self.new_log, + '--format', report_format] + + sys.argv = (args if not test_output else + args + ['--output', report_file]) + + with captured_output() as (out, _): + main() + report_out = out.getvalue() + + if test_output: + with open(report_file, 'r') as f: + report = f.read() + # because print adds newline, add one here, too: + report_file = str(report + '\n') + else: + report_file = None + + return report_out, report_file + + def test_markdown(self): + """Writes Markdown formatted report to stdout""" + report_out, _ = self.execute_main_with_format('markdown') + self.assert_report_contains(self.markdown, report_out) + + def 
test_markdown_output(self): + """Writes Markdown formatted report to stdout and `--output` file.""" + report_out, report_file = ( + self.execute_main_with_format('markdown', test_output=True)) + self.assertEquals(report_out, report_file) + self.assert_report_contains(self.markdown, report_file) + + def test_git(self): + """Writes Git formatted report to stdout.""" + report_out, _ = self.execute_main_with_format('git') + self.assert_report_contains(self.git, report_out) + + def test_git_output(self): + """Writes Git formatted report to stdout and `--output` file.""" + report_out, report_file = ( + self.execute_main_with_format('git', test_output=True)) + self.assertEquals(report_out, report_file) + self.assert_report_contains(self.git, report_file) + + def test_html(self): + """Writes HTML formatted report to stdout.""" + report_out, _ = self.execute_main_with_format('html') + self.assert_report_contains(self.html, report_out) + + def test_html_output(self): + """Writes HTML formatted report to stdout and `--output` file.""" + report_out, report_file = ( + self.execute_main_with_format('html', test_output=True)) + self.assertEquals(report_out, report_file) + self.assert_report_contains(self.html, report_file) + + if __name__ == '__main__': unittest.main() From e7b243cad7e27af63e6c25cd426fdc5359c4e51d Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sun, 4 Jun 2017 18:40:20 +0200 Subject: [PATCH 5/6] Fixed false statement in documentation. --- benchmark/scripts/compare_perf_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index dafe841d5287b..d99a6e2a985e2 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -218,7 +218,7 @@ class ReportFormatter(object): """ReportFormatter formats the `PerformanceTestResult`s and `ResultComparison`s provided by `TestComparator` using their `header` and `values()` into report table. Supported formats are: `markdown` (used for - displaying benchmark results on GitHub), `git` (default) and `html`. + displaying benchmark results on GitHub), `git` and `html`. """ def __init__(self, comparator, old_branch, new_branch, changes_only): self.comparator = comparator From 686c7619922d0de2bfdb3a0ec988b2c3c6bb60b5 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sun, 4 Jun 2017 18:48:19 +0200 Subject: [PATCH 6/6] One more typo fix. --- benchmark/scripts/test_compare_perf_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 46b3d3400ef20..fc7f4e4454e56 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -186,7 +186,7 @@ def setUp(self): """) # Log lines are deliberately in descending sort order in order to test # the ordering of the comparison results. - # ArrayAppend is included twice to test tesults merging. + # ArrayAppend is included twice to test results merging. self.new_log = self.write_temp_file('new.log', """ 265,TwoSum,20,5006,5679,5111,0,5111 35,ByteSwap,20,0,0,0,0,0