[CI][Benchmarks] Remove 'passed' field from Result #19283

Open · wants to merge 1 commit into base: sycl

11 changes: 11 additions & 0 deletions devops/scripts/benchmarks/benches/base.py
@@ -71,6 +71,17 @@ def teardown(self):

     @abstractmethod
     def run(self, env_vars) -> list[Result]:
+        """Execute the benchmark with the given environment variables.
+
+        Args:
+            env_vars: Environment variables to use when running the benchmark.
+
+        Returns:
+            A list of Result objects with the benchmark results.
+
+        Raises:
+            Exception: If the benchmark fails for any reason.
+        """
         pass
 
     @staticmethod
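Note: with this docstring, the contract for benchmark subclasses changes: a failed run should raise instead of returning Result objects marked passed=False. A minimal sketch of a conforming implementation, assuming the abstract base class in benches/base.py is named Benchmark; ToyBench, its command, and its timing value are made up for illustration:

# from benches.base import Benchmark; from utils.result import Result  (import paths assumed)

class ToyBench(Benchmark):
    def name(self) -> str:
        return "toy_bench"

    def run(self, env_vars) -> list[Result]:
        value_ms, verified = 12.5, True  # stand-ins for a real measurement
        if not verified:
            # Before this PR: Result(..., passed=False); now: raise and let
            # run_iterations() in main.py record the failure.
            raise Exception(f"{self.name()} verification failed")
        return [
            Result(
                label=self.name(),
                value=value_ms,
                command=["./toy_bench"],
                env=env_vars,
                unit="ms",
            )
        ]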
8 changes: 6 additions & 2 deletions devops/scripts/benchmarks/benches/syclbench.py
@@ -158,11 +158,15 @@ def run(self, env_vars) -> list[Result]:
         res_list = []
         for row in reader:
             if not row[0].startswith("#"):
+                label = f"{self.name()} {row[0]}"
+                # Check if the test passed
+                if row[1] != "PASS":
+                    raise Exception(f"{label}")
+
                 res_list.append(
                     Result(
-                        label=f"{self.name()} {row[0]}",
+                        label=label,
                         value=float(row[12]) * 1000,  # convert to ms
-                        passed=(row[1] == "PASS"),
                         command=command,
                         env=env_vars,
                         unit="ms",
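The fail-fast parsing above assumes the sycl-bench CSV layout implied by the diff: column 0 holds the test name (with "#" marking comment rows), column 1 the PASS/FAIL verdict, and column 12 the runtime in seconds. A standalone sketch of the same logic; the function and file names are illustrative only, not code from this repository:

import csv

def parse_sycl_bench_csv(path: str, bench_name: str) -> list[tuple[str, float]]:
    """Return (label, time_ms) pairs, raising on the first failed test."""
    parsed = []
    with open(path, newline="") as f:
        for row in csv.reader(f):
            if row[0].startswith("#"):  # skip comment/header rows
                continue
            label = f"{bench_name} {row[0]}"
            if row[1] != "PASS":  # mirrors the new raise in the diff
                raise Exception(label)
            parsed.append((label, float(row[12]) * 1000))  # seconds -> ms
    return parsed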
41 changes: 21 additions & 20 deletions devops/scripts/benchmarks/main.py
@@ -41,29 +41,30 @@ def run_iterations(
 ):
     for iter in range(iters):
         print(f"running {benchmark.name()}, iteration {iter}... ", flush=True)
-        bench_results = benchmark.run(env_vars)
-        if bench_results is None:
-            failures[benchmark.name()] = "benchmark produced no results!"
-            break
-
-        for bench_result in bench_results:
-            if not bench_result.passed:
-                failures[bench_result.label] = "verification failed"
-                print(f"complete ({bench_result.label}: verification failed).")
-                continue
-
-            print(
-                f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})."
-            )
+        try:
+            bench_results = benchmark.run(env_vars)
+            if bench_results is None:
+                failures[benchmark.name()] = "benchmark produced no results!"
+                break
+
+            for bench_result in bench_results:
+                print(
+                    f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})."
+                )
 
-            bench_result.name = bench_result.label
-            bench_result.lower_is_better = benchmark.lower_is_better()
-            bench_result.suite = benchmark.get_suite_name()
+                bench_result.name = bench_result.label
+                bench_result.lower_is_better = benchmark.lower_is_better()
+                bench_result.suite = benchmark.get_suite_name()
 
-            if bench_result.label not in results:
-                results[bench_result.label] = []
+                if bench_result.label not in results:
+                    results[bench_result.label] = []
 
-            results[bench_result.label].append(bench_result)
+                results[bench_result.label].append(bench_result)
+        except Exception as e:
+            failure_label = f"{benchmark.name()} iteration {iter}"
+            failures[failure_label] = f"verification failed: {str(e)}"
+            print(f"complete ({failure_label}: verification failed: {str(e)}).")
+            continue
 
 
 # https://www.statology.org/modified-z-score/
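To see how an exception raised by run() now surfaces, here is a self-contained toy mirroring the try/except above; FailingBench and the loop body are illustrative stand-ins, not code from this repository:

failures: dict[str, str] = {}

class FailingBench:
    def name(self) -> str:
        return "toy"

    def run(self, env_vars) -> list:
        raise Exception(f"{self.name()} bar_test")  # simulated verification failure

benchmark, env_vars, iters = FailingBench(), {}, 2
for iter in range(iters):
    try:
        benchmark.run(env_vars)
    except Exception as e:
        failure_label = f"{benchmark.name()} iteration {iter}"
        failures[failure_label] = f"verification failed: {str(e)}"
        continue

print(failures)
# {'toy iteration 0': 'verification failed: toy bar_test',
#  'toy iteration 1': 'verification failed: toy bar_test'}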
1 change: 0 additions & 1 deletion devops/scripts/benchmarks/utils/result.py
@@ -15,7 +15,6 @@ class Result:
     value: float
     command: list[str]
     env: dict[str, str]
-    passed: bool = True
     unit: str = ""
     # stddev can be optionally set by the benchmark,
     # if not set, it will be calculated automatically.
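For reference, after this change the Result dataclass reduces to roughly the following; only the fields visible in this PR are shown, label is inferred from the call sites, and the stddev type and default are assumptions:

from dataclasses import dataclass

@dataclass
class Result:
    label: str
    value: float
    command: list[str]
    env: dict[str, str]
    unit: str = ""
    # stddev can be optionally set by the benchmark,
    # if not set, it will be calculated automatically.
    stddev: float = 0.0  # type and default assumed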