From 989369142330e6c965e7731b0d4568c245f3e284 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 2 Jun 2025 12:53:16 -0700 Subject: [PATCH 01/12] Implement EWMA --- devops/actions/run-tests/benchmark/action.yml | 3 ++ devops/scripts/benchmarks/compare.py | 17 +++++--- devops/scripts/benchmarks/options.py | 2 + .../benchmarks/tests/test_aggregate.py | 39 +++++++++++++++++++ devops/scripts/benchmarks/utils/aggregate.py | 30 ++++++++++++++ 5 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 devops/scripts/benchmarks/tests/test_aggregate.py diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index 62dbe4fd4a1bc..6593d3b6816e3 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -157,8 +157,11 @@ runs: --preset "$PRESET" \ --timestamp-override "$SAVE_TIMESTAMP" \ --detect-version sycl,compute_runtime + echo "-----" python3 ./devops/scripts/benchmarks/compare.py to_hist \ + --avg-type EWMA \ + --cutoff "$(date -u -d '30 days ago' +'%Y%m%d_%H%M%S)" \ --name "$SAVE_NAME" \ --compare-file "./llvm-ci-perf-results/results/${SAVE_NAME}_${SAVE_TIMESTAMP}.json" \ --results-dir "./llvm-ci-perf-results/results/" diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index eea5e450e6729..22ed498800742 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -254,8 +254,12 @@ def to_hist( from the average for this benchmark run. """ - if avg_type != "median": - print("Only median is currently supported: Refusing to continue.") + if avg_type == "median": + aggregator_type = SimpleMedian + elif avg_type == "EWMA": + aggregator_type = EWMA + else: + print("Error: Unsupported avg_type f{avg_type}.") exit(1) try: @@ -281,6 +285,7 @@ def to_hist( result_dir, compare_result.hostname, cutoff, + aggregator=aggregator_type, exclude=[Path(compare_file).stem], ) return Compare.to_hist_avg(hist_avg, compare_result) @@ -323,14 +328,14 @@ def to_hist( args = parser.parse_args() if args.operation == "to_hist": - if args.avg_type != "median": - print("Only median is currently supported: exiting.") - exit(1) if not Validate.timestamp(args.cutoff): raise ValueError("Timestamp must be provided as YYYYMMDD_HHMMSS.") + if args.avg_type not in ["median", "EWMA"]: + print("Only median is currently supported: exiting.") + exit(1) improvements, regressions = Compare.to_hist( - "median", args.name, args.compare_file, args.results_dir, args.cutoff + args.avg_type, args.name, args.compare_file, args.results_dir, args.cutoff ) def print_regression(entry: dict): diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index c0b385ebead17..1803dfa46d645 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -88,6 +88,8 @@ class Options: # CI scripts vs SYCl build source. github_repo_override: str = None git_commit_override: str = None + # EWMA Options: + EWMA_smoothing_factor: float = 0.2 detect_versions: DetectVersionsOptions = field( default_factory=DetectVersionsOptions diff --git a/devops/scripts/benchmarks/tests/test_aggregate.py b/devops/scripts/benchmarks/tests/test_aggregate.py new file mode 100644 index 0000000000000..b7f13f642e863 --- /dev/null +++ b/devops/scripts/benchmarks/tests/test_aggregate.py @@ -0,0 +1,39 @@ +import sys +import os + +sys.path.append(f"{os.path.dirname(__file__)}/../") +from options import options +from utils.aggregate import * + +def run_testcase(aggregator: Aggregator, src: list, expected: float) -> bool: + aggr = aggregator() + for n in src: + aggr.add(n) + res = aggr.get_avg() + if res != expected: + print(f"Failed: {aggregator}, {src} -- expected {expected}, got {res}") + return False + return True + + +def test_EWMA(): + options.EWMA_smoothing_factor = 0.5 + testcases = [ + ([], None), + ([100], 100), + ([100, 100, 100, 100, 100], 100), + ([100, 105, 103, 108, 107], 106.1875), + ] + successes = 0 + fails = 0 + for t in testcases: + if not run_testcase(EWMA, *t): + fails = fails + 1 + else: + successes = successes + 1 + print(f"EWMA test: {successes} successes, {fails} fails.") + + +if __name__ == "__main__": + test_EWMA() + diff --git a/devops/scripts/benchmarks/utils/aggregate.py b/devops/scripts/benchmarks/utils/aggregate.py index 36ee7cbecaae6..249ef1de952b1 100644 --- a/devops/scripts/benchmarks/utils/aggregate.py +++ b/devops/scripts/benchmarks/utils/aggregate.py @@ -1,6 +1,8 @@ import statistics from abc import ABC, abstractmethod +from options import options + class Aggregator(ABC): """ @@ -51,3 +53,31 @@ def add(self, n: float): def get_avg(self) -> float: return statistics.median(self.elements) + + +class EWMA(Aggregator): + """ + Exponentially weighted moving average based on all elements added to the + aggregator. + """ + + def __init__(self, starting_elements: list = []): + self.elements = starting_elements + + @staticmethod + def get_type() -> str: + return "EWMA" + + def add(self, n: float): + self.elements.append(n) + + def get_avg(self) -> float: + if len(self.elements) == 0: + return None # No elements collected, cannot provide an average + + alpha = options.EWMA_smoothing_factor + ewma_t = self.elements[0] + for x_t in self.elements[1:]: + ewma_t = alpha * x_t + (1 - alpha) * ewma_t + return ewma_t + From 7109f8a05a2ed7ef6884d40b87d29dbca9c6864e Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 6 Jun 2025 08:10:10 -0700 Subject: [PATCH 02/12] Fix typo --- devops/actions/run-tests/benchmark/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index 6593d3b6816e3..fed4188eefa82 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -161,7 +161,7 @@ runs: echo "-----" python3 ./devops/scripts/benchmarks/compare.py to_hist \ --avg-type EWMA \ - --cutoff "$(date -u -d '30 days ago' +'%Y%m%d_%H%M%S)" \ + --cutoff "$(date -u -d '30 days ago' +'%Y%m%d_%H%M%S')" \ --name "$SAVE_NAME" \ --compare-file "./llvm-ci-perf-results/results/${SAVE_NAME}_${SAVE_TIMESTAMP}.json" \ --results-dir "./llvm-ci-perf-results/results/" From d6e7e66a2e5682d5730fb3f97fc6c136ef67235e Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 6 Jun 2025 09:32:02 -0700 Subject: [PATCH 03/12] Fix import --- devops/scripts/benchmarks/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index 22ed498800742..46b8a8e7aab30 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -1,4 +1,4 @@ -from utils.aggregate import Aggregator, SimpleMedian +from utils.aggregate import Aggregator, SimpleMedian, EWMA from utils.validate import Validate from utils.result import Result, BenchmarkRun from options import options From 6b52f0f29757901f6cf02de74c8007ed64033bb2 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 6 Jun 2025 11:16:33 -0700 Subject: [PATCH 04/12] add verbose output --- devops/scripts/benchmarks/compare.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index 46b8a8e7aab30..5744fbad7ef32 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -12,6 +12,8 @@ from dataclasses import dataclass, asdict +verbose = False + @dataclass class BenchmarkHistoricAverage: """Contains historic average information for 1 benchmark""" @@ -224,6 +226,9 @@ def perf_diff_entry() -> dict: elif halfway_round(delta, 2) < -options.regression_threshold: regression.append(perf_diff_entry()) + if verbose: + print(f"{test.name}: expect {hist_avg[test.name].value}, got {test.value}") + return improvement, regression def to_hist( @@ -324,9 +329,19 @@ def to_hist( help="Timestamp (in YYYYMMDD_HHMMSS) of oldest result to include in historic average calculation", default="20000101_010101", ) + parser_avg.add_argument( + "--verbose", + action='store_true', + help="Increase output verbosity", + ) args = parser.parse_args() + if args.verbose: + global verbose + verbose = True + print("-- Compare.py --") + if args.operation == "to_hist": if not Validate.timestamp(args.cutoff): raise ValueError("Timestamp must be provided as YYYYMMDD_HHMMSS.") From d00c6bb649c8cfa897ae996fc421928737180481 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 6 Jun 2025 11:59:09 -0700 Subject: [PATCH 05/12] fix bug --- devops/scripts/benchmarks/compare.py | 1 - 1 file changed, 1 deletion(-) diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index 5744fbad7ef32..cd0178a2f8277 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -338,7 +338,6 @@ def to_hist( args = parser.parse_args() if args.verbose: - global verbose verbose = True print("-- Compare.py --") From 5f2758dee43e1aa89eb85b0dee05127a15ed32b7 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 6 Jun 2025 12:40:03 -0700 Subject: [PATCH 06/12] use verbose option --- devops/actions/run-tests/benchmark/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index fed4188eefa82..19bb1ee4494df 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -164,7 +164,8 @@ runs: --cutoff "$(date -u -d '30 days ago' +'%Y%m%d_%H%M%S')" \ --name "$SAVE_NAME" \ --compare-file "./llvm-ci-perf-results/results/${SAVE_NAME}_${SAVE_TIMESTAMP}.json" \ - --results-dir "./llvm-ci-perf-results/results/" + --results-dir "./llvm-ci-perf-results/results/" \ + --verbose echo "-----" - name: Cache changes to benchmark folder for archival purposes From 354174ec9ee855b3ec71fe5f8706fb59cea41346 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 6 Jun 2025 14:43:38 -0700 Subject: [PATCH 07/12] formatting --- devops/scripts/benchmarks/compare.py | 6 ++++-- devops/scripts/benchmarks/tests/test_aggregate.py | 2 +- devops/scripts/benchmarks/utils/aggregate.py | 3 +-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index cd0178a2f8277..90272623fe7df 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -227,7 +227,9 @@ def perf_diff_entry() -> dict: regression.append(perf_diff_entry()) if verbose: - print(f"{test.name}: expect {hist_avg[test.name].value}, got {test.value}") + print( + f"{test.name}: expect {hist_avg[test.name].value}, got {test.value}" + ) return improvement, regression @@ -331,7 +333,7 @@ def to_hist( ) parser_avg.add_argument( "--verbose", - action='store_true', + action="store_true", help="Increase output verbosity", ) diff --git a/devops/scripts/benchmarks/tests/test_aggregate.py b/devops/scripts/benchmarks/tests/test_aggregate.py index b7f13f642e863..4b75f16fdbff4 100644 --- a/devops/scripts/benchmarks/tests/test_aggregate.py +++ b/devops/scripts/benchmarks/tests/test_aggregate.py @@ -5,6 +5,7 @@ from options import options from utils.aggregate import * + def run_testcase(aggregator: Aggregator, src: list, expected: float) -> bool: aggr = aggregator() for n in src: @@ -36,4 +37,3 @@ def test_EWMA(): if __name__ == "__main__": test_EWMA() - diff --git a/devops/scripts/benchmarks/utils/aggregate.py b/devops/scripts/benchmarks/utils/aggregate.py index 249ef1de952b1..8938320536593 100644 --- a/devops/scripts/benchmarks/utils/aggregate.py +++ b/devops/scripts/benchmarks/utils/aggregate.py @@ -73,11 +73,10 @@ def add(self, n: float): def get_avg(self) -> float: if len(self.elements) == 0: - return None # No elements collected, cannot provide an average + return None # No elements collected, cannot provide an average alpha = options.EWMA_smoothing_factor ewma_t = self.elements[0] for x_t in self.elements[1:]: ewma_t = alpha * x_t + (1 - alpha) * ewma_t return ewma_t - From 96130f5c4aec643e9518c7ee41b648d8e11f3a3f Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Tue, 10 Jun 2025 07:24:44 -0700 Subject: [PATCH 08/12] fix message not including ewma --- devops/scripts/benchmarks/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index 90272623fe7df..5ac55602bcdb6 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -347,7 +347,7 @@ def to_hist( if not Validate.timestamp(args.cutoff): raise ValueError("Timestamp must be provided as YYYYMMDD_HHMMSS.") if args.avg_type not in ["median", "EWMA"]: - print("Only median is currently supported: exiting.") + print("Only median, EWMA is currently supported: exiting.") exit(1) improvements, regressions = Compare.to_hist( From fc345dc98417e1100aa3cbcdd4110f5400d8f80b Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Wed, 11 Jun 2025 11:42:39 -0700 Subject: [PATCH 09/12] Lower smoothing factor --- devops/scripts/benchmarks/options.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index d853896a2e570..84bbc1301deec 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -72,6 +72,7 @@ class Options: build_jobs: int = multiprocessing.cpu_count() # Options intended for CI: + regression_threshold: float = 0.05 # It's necessary in CI to compare or redo benchmark runs. Instead of # generating a new timestamp each run by default, specify a single timestamp @@ -88,8 +89,18 @@ class Options: # CI scripts vs SYCl build source. github_repo_override: str = None git_commit_override: str = None + # EWMA Options: - EWMA_smoothing_factor: float = 0.2 + + # The smoothing factor is alpha in the EWMA equation. Generally, a higher + # smoothing factor results in newer data having more weight, and a lower + # smoothing factor results in older data having more weight. + # + # Valid values for this smoothing factor ranges from (0, 1). Recommended + # values are from 0.1 to 0.3, although this is not a law. Note that no + # value of smothing factor will result in older elements having more weight + # than newer elements. + EWMA_smoothing_factor: float = 0.15 detect_versions: DetectVersionsOptions = field( default_factory=DetectVersionsOptions From 78bd541706670aaf457fd3151aa82dc0ebd8c306 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Wed, 11 Jun 2025 12:29:31 -0700 Subject: [PATCH 10/12] adjust alpha again --- devops/scripts/benchmarks/options.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index 84bbc1301deec..7ac3f70c8da1a 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -96,11 +96,10 @@ class Options: # smoothing factor results in newer data having more weight, and a lower # smoothing factor results in older data having more weight. # - # Valid values for this smoothing factor ranges from (0, 1). Recommended - # values are from 0.1 to 0.3, although this is not a law. Note that no + # Valid values for this smoothing factor ranges from (0, 1). Note that no # value of smothing factor will result in older elements having more weight # than newer elements. - EWMA_smoothing_factor: float = 0.15 + EWMA_smoothing_factor: float = 0.1 detect_versions: DetectVersionsOptions = field( default_factory=DetectVersionsOptions From 893334a847560fb8ee1941360da69ebcc949159f Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 30 Jun 2025 12:05:31 -0700 Subject: [PATCH 11/12] Raise smoothing factor again due to recent regressions --- devops/scripts/benchmarks/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index 3beed64324a08..d442e5c1f3bb6 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -103,7 +103,7 @@ class Options: # Valid values for this smoothing factor ranges from (0, 1). Note that no # value of smothing factor will result in older elements having more weight # than newer elements. - EWMA_smoothing_factor: float = 0.1 + EWMA_smoothing_factor: float = 0.2 detect_versions: DetectVersionsOptions = field( default_factory=DetectVersionsOptions From ffba7fbb6dbd5dea6abd5712f4df74427d42e974 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Wed, 2 Jul 2025 10:02:18 -0700 Subject: [PATCH 12/12] Shrink time considered from 30 days to 7, lower smoothing factor --- devops/actions/run-tests/benchmark/action.yml | 2 +- devops/scripts/benchmarks/options.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index 52a6d88d550cd..16b6529b11b02 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -161,7 +161,7 @@ runs: echo "-----" python3 ./devops/scripts/benchmarks/compare.py to_hist \ --avg-type EWMA \ - --cutoff "$(date -u -d '30 days ago' +'%Y%m%d_%H%M%S')" \ + --cutoff "$(date -u -d '7 days ago' +'%Y%m%d_%H%M%S')" \ --name "$SAVE_NAME" \ --compare-file "./llvm-ci-perf-results/results/${SAVE_NAME}_${SAVE_TIMESTAMP}.json" \ --results-dir "./llvm-ci-perf-results/results/" \ diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index d442e5c1f3bb6..3b5fed809fbb8 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -103,7 +103,7 @@ class Options: # Valid values for this smoothing factor ranges from (0, 1). Note that no # value of smothing factor will result in older elements having more weight # than newer elements. - EWMA_smoothing_factor: float = 0.2 + EWMA_smoothing_factor: float = 0.15 detect_versions: DetectVersionsOptions = field( default_factory=DetectVersionsOptions