Skip to content

[CI][Bench] Implement exponentially weighted moving average for SYCL nightly regression CI #18766

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion devops/actions/run-tests/benchmark/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -157,12 +157,16 @@ runs:
--preset "$PRESET" \
--timestamp-override "$SAVE_TIMESTAMP" \
--detect-version sycl,compute_runtime

echo "-----"
python3 ./devops/scripts/benchmarks/compare.py to_hist \
--avg-type EWMA \
--cutoff "$(date -u -d '7 days ago' +'%Y%m%d_%H%M%S')" \
--name "$SAVE_NAME" \
--compare-file "./llvm-ci-perf-results/results/${SAVE_NAME}_${SAVE_TIMESTAMP}.json" \
--results-dir "./llvm-ci-perf-results/results/" \
--regression-filter '^[a-z_]+_sycl '
--regression-filter '^[a-z_]+_sycl ' \
--verbose
echo "-----"

- name: Cache changes to benchmark folder for archival purposes
Expand Down
35 changes: 28 additions & 7 deletions devops/scripts/benchmarks/compare.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from utils.aggregate import Aggregator, SimpleMedian
from utils.aggregate import Aggregator, SimpleMedian, EWMA
from utils.validate import Validate
from utils.result import Result, BenchmarkRun
from options import options
Expand All @@ -13,6 +13,8 @@
from dataclasses import dataclass, asdict


verbose = False

@dataclass
class BenchmarkHistoricAverage:
"""Contains historic average information for 1 benchmark"""
Expand Down Expand Up @@ -225,6 +227,11 @@ def perf_diff_entry() -> dict:
elif halfway_round(delta, 2) < -options.regression_threshold:
regression.append(perf_diff_entry())

if verbose:
print(
f"{test.name}: expect {hist_avg[test.name].value}, got {test.value}"
)

return improvement, regression

def to_hist(
Expand Down Expand Up @@ -255,8 +262,12 @@ def to_hist(
from the average for this benchmark run.
"""

if avg_type != "median":
print("Only median is currently supported: Refusing to continue.")
if avg_type == "median":
aggregator_type = SimpleMedian
elif avg_type == "EWMA":
aggregator_type = EWMA
else:
print(f"Error: Unsupported avg_type {avg_type}.")
exit(1)

try:
Expand All @@ -282,6 +293,7 @@ def to_hist(
result_dir,
compare_result.hostname,
cutoff,
aggregator=aggregator_type,
exclude=[Path(compare_file).stem],
)
return Compare.to_hist_avg(hist_avg, compare_result)
Expand Down Expand Up @@ -320,6 +332,11 @@ def to_hist(
help="Timestamp (in YYYYMMDD_HHMMSS) of oldest result to include in historic average calculation",
default="20000101_010101",
)
parser_avg.add_argument(
"--verbose",
action="store_true",
help="Increase output verbosity",
)
parser_avg.add_argument(
"--regression-filter",
type=str,
Expand All @@ -329,15 +346,19 @@ def to_hist(

args = parser.parse_args()

if args.verbose:
verbose = True
print("-- Compare.py --")

if args.operation == "to_hist":
if args.avg_type != "median":
print("Only median is currently supported: exiting.")
exit(1)
if not Validate.timestamp(args.cutoff):
raise ValueError("Timestamp must be provided as YYYYMMDD_HHMMSS.")
if args.avg_type not in ["median", "EWMA"]:
print("Only median, EWMA is currently supported: exiting.")
exit(1)

improvements, regressions = Compare.to_hist(
"median", args.name, args.compare_file, args.results_dir, args.cutoff
args.avg_type, args.name, args.compare_file, args.results_dir, args.cutoff
)

# Not all regressions are of concern: if a filter is provided, filter
Expand Down
12 changes: 12 additions & 0 deletions devops/scripts/benchmarks/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class Options:
exit_on_failure: bool = False

# Options intended for CI:

regression_threshold: float = 0.05
# It's necessary in CI to compare or redo benchmark runs. Instead of
# generating a new timestamp each run by default, specify a single timestamp
Expand All @@ -94,6 +95,17 @@ class Options:
archive_baseline_days: int = 30 # Archive Baseline_* runs after 30 days
archive_pr_days: int = 7 # Archive other (PR/dev) runs after 7 days

# EWMA Options:

# The smoothing factor is alpha in the EWMA equation. Generally, a higher
# smoothing factor results in newer data having more weight, and a lower
# smoothing factor results in older data having more weight.
#
# Valid values for the smoothing factor lie in the open interval (0, 1). Note
# that, apart from the first (seed) element, no value of the smoothing factor
# results in older elements having more weight than newer elements.
EWMA_smoothing_factor: float = 0.15

detect_versions: DetectVersionsOptions = field(
default_factory=DetectVersionsOptions
)
Expand Down
39 changes: 39 additions & 0 deletions devops/scripts/benchmarks/tests/test_aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys
import os

sys.path.append(f"{os.path.dirname(__file__)}/../")
from options import options
from utils.aggregate import *


def run_testcase(
    aggregator: "type[Aggregator]", src: list, expected: "float | None"
) -> bool:
    """
    Feed `src` into a fresh aggregator and check the resulting average.

    Args:
        aggregator: Aggregator class (not an instance); it is called with no
            arguments to build a new aggregator for this testcase.
        src: Values passed to the aggregator's `add()` one by one, in order.
        expected: Average the aggregator should report, or None if the
            aggregator is expected to report no average.

    Returns:
        True if the reported average matches `expected`; otherwise prints a
        failure message and returns False.
    """
    import math

    aggr = aggregator()
    for n in src:
        aggr.add(n)
    res = aggr.get_avg()

    if res is None or expected is None:
        # "No average" only matches "no average"; never compare None
        # numerically.
        matches = res is expected
    else:
        # Averages are floats produced by repeated multiplication/addition,
        # so compare with a tolerance rather than exact equality.
        matches = math.isclose(res, expected, rel_tol=1e-9, abs_tol=1e-12)

    if not matches:
        print(f"Failed: {aggregator}, {src} -- expected {expected}, got {res}")
        return False
    return True


def test_EWMA():
    """
    Check the EWMA aggregator against hand-computed averages.

    The smoothing factor is pinned to 0.5 for the duration of the test and
    restored afterwards, so other code sharing `options` is unaffected.

    Raises:
        AssertionError: if any testcase fails.
    """
    testcases = [
        ([], None),
        ([100], 100),
        ([100, 100, 100, 100, 100], 100),
        # With alpha = 0.5: 100 -> 102.5 -> 102.75 -> 105.375 -> 106.1875
        ([100, 105, 103, 108, 107], 106.1875),
    ]

    saved_smoothing_factor = options.EWMA_smoothing_factor
    options.EWMA_smoothing_factor = 0.5
    successes = 0
    fails = 0
    try:
        for t in testcases:
            if run_testcase(EWMA, *t):
                successes += 1
            else:
                fails += 1
    finally:
        # Do not leak the test-only smoothing factor into the global options.
        options.EWMA_smoothing_factor = saved_smoothing_factor

    print(f"EWMA test: {successes} successes, {fails} fails.")
    # Printing alone would leave the exit status at 0 even on failure.
    assert fails == 0, f"{fails} EWMA testcase(s) failed"


if __name__ == "__main__":
    test_EWMA()
29 changes: 29 additions & 0 deletions devops/scripts/benchmarks/utils/aggregate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import statistics
from abc import ABC, abstractmethod

from options import options


class Aggregator(ABC):
"""
Expand Down Expand Up @@ -51,3 +53,30 @@ def add(self, n: float):

def get_avg(self) -> float:
return statistics.median(self.elements)


class EWMA(Aggregator):
    """
    Exponentially weighted moving average based on all elements added to the
    aggregator.

    Elements are folded in insertion order: the first element seeds the
    average, and every later element x_t updates it as
    `alpha * x_t + (1 - alpha) * previous`, where alpha is
    `options.EWMA_smoothing_factor`, read when `get_avg()` is called.
    """

    def __init__(self, starting_elements: list = None):
        """
        Args:
            starting_elements: Optional initial elements, oldest first. The
                list is copied, so later `add()` calls do not modify it.
        """
        # A mutable default (`= []`) stored by reference would be shared by
        # every instance created without arguments, leaking elements from one
        # aggregator into the next.
        self.elements = [] if starting_elements is None else list(starting_elements)

    @staticmethod
    def get_type() -> str:
        """Return the identifier of this aggregator type."""
        return "EWMA"

    def add(self, n: float):
        """Append `n` as the newest element."""
        self.elements.append(n)

    def get_avg(self) -> float:
        """
        Return the exponentially weighted moving average of all elements, or
        None if no elements have been added.
        """
        if not self.elements:
            return None  # No elements collected, cannot provide an average

        alpha = options.EWMA_smoothing_factor
        ewma_t = self.elements[0]
        for x_t in self.elements[1:]:
            ewma_t = alpha * x_t + (1 - alpha) * ewma_t
        return ewma_t
Loading