
Commit 6bb2a3f

Add retry support and graceful error handling for evals
1 parent 08140ba commit 6bb2a3f

8 files changed: +491 -198 lines changed


examples/pydantic_ai_examples/evals/example_03_unit_testing.py

Lines changed: 3 additions & 1 deletion
@@ -29,7 +29,9 @@ def evaluate_dataset():
     report = dataset.evaluate_sync(infer_time_range)
     print(report)

-    assertion_pass_rate = report.averages().assertions
+    averages = report.averages()
+    assert averages is not None
+    assertion_pass_rate = averages.assertions
     assert assertion_pass_rate is not None, 'There should be at least one assertion'
     assert assertion_pass_rate > 0.9, (
         f'The assertion pass rate was {assertion_pass_rate:.1%}; it should be above 90%.'
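With this change, `report.averages()` can return `None` (for example when no case produces results), and case failures are collected on the report rather than raised. A minimal test sketch under those assumptions, reusing the example's `dataset` and `infer_time_range` (`test_dataset_quality` is an illustrative name):

    def test_dataset_quality():
        report = dataset.evaluate_sync(infer_time_range)

        # Task/evaluator crashes no longer abort the run; they are collected on the report.
        assert not report.failures, f'Unexpected case failures: {report.failures}'

        averages = report.averages()
        assert averages is not None, 'Expected at least one successfully evaluated case'
        assert averages.assertions is not None and averages.assertions > 0.9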

pydantic_evals/pydantic_evals/dataset.py

Lines changed: 126 additions & 85 deletions
@@ -20,7 +20,7 @@
 from dataclasses import dataclass, field
 from inspect import iscoroutinefunction
 from pathlib import Path
-from typing import Any, Callable, Generic, Literal, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union, cast

 import anyio
 import logfire_api
@@ -41,9 +41,13 @@
 from .evaluators._spec import EvaluatorSpec
 from .evaluators.common import DEFAULT_EVALUATORS
 from .evaluators.context import EvaluatorContext
+from .evaluators.evaluator import EvaluatorFailure
 from .otel import SpanTree
 from .otel._context_subtree import context_subtree
-from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
+from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate, ReportCaseFailure
+
+if TYPE_CHECKING:
+    from tenacity import AsyncRetrying

 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup  # pragma: lax no cover
@@ -84,6 +88,7 @@


 _REPORT_CASES_ADAPTER = TypeAdapter(list[ReportCase])
+_REPORT_CASE_FAILURES_ADAPTER = TypeAdapter(list[ReportCaseFailure])
 _REPORT_CASE_AGGREGATE_ADAPTER = TypeAdapter(ReportCaseAggregate)


@@ -171,11 +176,6 @@ def __init__(
         self.evaluators = list(evaluators)


-# TODO: Consider making one or more of the following changes to this type:
-# * Add `task: Callable[[InputsT], Awaitable[OutputT]` as a field
-# * Add `inputs_type`, `output_type`, etc. as kwargs on `__init__`
-# * Rename to `Evaluation`
-# TODO: Allow `task` to be sync _or_ async
 class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', arbitrary_types_allowed=True):
     """A dataset of test [cases][pydantic_evals.Case].

@@ -263,6 +263,7 @@ async def evaluate(
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
+        retry: AsyncRetrying | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -292,24 +293,30 @@

             async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
                 async with limiter:
-                    result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)
+                    result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators, retry)
                     if progress_bar and task_id is not None:  # pragma: no branch
                         progress_bar.update(task_id, advance=1)
                     return result

+            cases_and_failures = await task_group_gather(
+                [
+                    lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}')
+                    for i, case in enumerate(self.cases, 1)
+                ]
+            )
             report = EvaluationReport(
                 name=name,
-                cases=await task_group_gather(
-                    [
-                        lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}')
-                        for i, case in enumerate(self.cases, 1)
-                    ]
-                ),
+                cases=[x for x in cases_and_failures if isinstance(x, ReportCase)],
+                failures=[x for x in cases_and_failures if isinstance(x, ReportCaseFailure)],
             )
             # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
             eval_span.set_attribute('cases', _REPORT_CASES_ADAPTER.dump_python(report.cases))
+            # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
+            eval_span.set_attribute('failures', _REPORT_CASE_FAILURES_ADAPTER.dump_python(report.failures))
             # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
-            eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(report.averages()))
+            averages = report.averages()
+            if averages:
+                eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(averages))
         return report

     def evaluate_sync(
@@ -817,38 +824,55 @@ def record_attribute(self, name: str, value: Any) -> None:


 async def _run_task(
-    task: Callable[[InputsT], Awaitable[OutputT] | OutputT], case: Case[InputsT, OutputT, MetadataT]
+    task: Callable[[InputsT], Awaitable[OutputT] | OutputT],
+    case: Case[InputsT, OutputT, MetadataT],
+    retry: AsyncRetrying | None = None,
 ) -> EvaluatorContext[InputsT, OutputT, MetadataT]:
     """Run a task on a case and return the context for evaluators.

     Args:
         task: The task to run.
         case: The case to run the task on.
+        retry: The retry strategy to use.

     Returns:
         An EvaluatorContext containing the inputs, actual output, expected output, and metadata.

     Raises:
         Exception: Any exception raised by the task.
     """
-    task_run = _TaskRun()
-    if _CURRENT_TASK_RUN.get() is not None:  # pragma: no cover
-        raise RuntimeError('A task run has already been entered. Task runs should not be nested')

-    # Note: the current behavior is for task execution errors to just bubble up all the way and kill the evaluation.
-    # Should we handle them for the user in some way? If so, I guess we'd want to do that here.
-    token = _CURRENT_TASK_RUN.set(task_run)
-    try:
-        with _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span:
-            with context_subtree() as span_tree:
+    async def _run_once():
+        task_run_ = _TaskRun()
+        if _CURRENT_TASK_RUN.get() is not None:  # pragma: no cover
+            raise RuntimeError('A task run has already been entered. Task runs should not be nested')
+
+        token = _CURRENT_TASK_RUN.set(task_run_)
+        try:
+            with (
+                _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span,
+                context_subtree() as span_tree_,
+            ):
                 t0 = time.perf_counter()
                 if iscoroutinefunction(task):
-                    task_output = cast(OutputT, await task(case.inputs))
+                    task_output_ = cast(OutputT, await task(case.inputs))
                 else:
-                    task_output = cast(OutputT, await to_thread.run_sync(task, case.inputs))
+                    task_output_ = cast(OutputT, await to_thread.run_sync(task, case.inputs))
                 fallback_duration = time.perf_counter() - t0
-    finally:
-        _CURRENT_TASK_RUN.reset(token)
+            duration_ = _get_span_duration(task_span, fallback_duration)
+            return task_run_, task_output_, duration_, span_tree_
+        finally:
+            _CURRENT_TASK_RUN.reset(token)
+
+    async def _run_with_retries():
+        if retry:
+            async for attempt in retry:
+                with attempt:
+                    return await _run_once()
+        # Note: the following line will be unreachable if retry is not None
+        return await _run_once()
+
+    task_run, task_output, duration, span_tree = await _run_with_retries()

     if isinstance(span_tree, SpanTree):  # pragma: no branch
         # TODO: Question: Should we make this metric-attributes functionality more user-configurable in some way before merging?
@@ -863,6 +887,7 @@ async def _run_task(
             if not isinstance(v, (int, float)):
                 continue
             # TODO: Revisit this choice to strip the prefix..
+            # TODO: Use the span-tracking-of-metrics functionality to simplify this implementation
             if k.startswith('gen_ai.usage.details.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
             elif k.startswith('gen_ai.usage.'):
@@ -874,7 +899,7 @@ async def _run_task(
         metadata=case.metadata,
         expected_output=case.expected_output,
         output=task_output,
-        duration=_get_span_duration(task_span, fallback_duration),
+        duration=duration,
         _span_tree=span_tree,
         attributes=task_run.attributes,
         metrics=task_run.metrics,
@@ -886,7 +911,8 @@ async def _run_task_and_evaluators(
     case: Case[InputsT, OutputT, MetadataT],
     report_case_name: str,
     dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-) -> ReportCase[InputsT, OutputT, MetadataT]:
+    retry: AsyncRetrying | None,
+) -> ReportCase[InputsT, OutputT, MetadataT] | ReportCaseFailure[InputsT, OutputT, MetadataT]:
     """Run a task on a case and evaluate the results.

     Args:
@@ -898,60 +924,75 @@ async def _run_task_and_evaluators(
     Returns:
         A ReportCase containing the evaluation results.
     """
-    with _logfire.span(
-        'case: {case_name}',
-        task_name=get_unwrapped_function_name(task),
-        case_name=report_case_name,
-        inputs=case.inputs,
-        metadata=case.metadata,
-        expected_output=case.expected_output,
-    ) as case_span:
-        t0 = time.time()
-        scoring_context = await _run_task(task, case)
-
-        case_span.set_attribute('output', scoring_context.output)
-        case_span.set_attribute('task_duration', scoring_context.duration)
-        case_span.set_attribute('metrics', scoring_context.metrics)
-        case_span.set_attribute('attributes', scoring_context.attributes)
-
-        evaluators = case.evaluators + dataset_evaluators
-        evaluator_outputs: list[EvaluationResult] = []
-        if evaluators:
-            evaluator_outputs_by_task = await task_group_gather(
-                [lambda ev=ev: run_evaluator(ev, scoring_context) for ev in evaluators]
-            )
-            evaluator_outputs += [out for outputs in evaluator_outputs_by_task for out in outputs]
-
-        assertions, scores, labels = _group_evaluator_outputs_by_type(evaluator_outputs)
-        case_span.set_attribute('assertions', _evaluation_results_adapter.dump_python(assertions))
-        case_span.set_attribute('scores', _evaluation_results_adapter.dump_python(scores))
-        case_span.set_attribute('labels', _evaluation_results_adapter.dump_python(labels))
-
-        context = case_span.context
-        if context is None:  # pragma: no cover
-            trace_id = ''
-            span_id = ''
-        else:
-            trace_id = f'{context.trace_id:032x}'
-            span_id = f'{context.span_id:016x}'
-        fallback_duration = time.time() - t0
-
-    return ReportCase[InputsT, OutputT, MetadataT](
-        name=report_case_name,
-        inputs=case.inputs,
-        metadata=case.metadata,
-        expected_output=case.expected_output,
-        output=scoring_context.output,
-        metrics=scoring_context.metrics,
-        attributes=scoring_context.attributes,
-        scores=scores,
-        labels=labels,
-        assertions=assertions,
-        task_duration=scoring_context.duration,
-        total_duration=_get_span_duration(case_span, fallback_duration),
-        trace_id=trace_id,
-        span_id=span_id,
-    )
+    trace_id = ''
+    span_id = ''
+    try:
+        with _logfire.span(
+            'case: {case_name}',
+            task_name=get_unwrapped_function_name(task),
+            case_name=report_case_name,
+            inputs=case.inputs,
+            metadata=case.metadata,
+            expected_output=case.expected_output,
+        ) as case_span:
+            context = case_span.context
+            if context is not None:  # pragma: no cover
+                trace_id = f'{context.trace_id:032x}'
+                span_id = f'{context.span_id:016x}'
+
+            t0 = time.time()
+            scoring_context = await _run_task(task, case, retry)
+
+            case_span.set_attribute('output', scoring_context.output)
+            case_span.set_attribute('task_duration', scoring_context.duration)
+            case_span.set_attribute('metrics', scoring_context.metrics)
+            case_span.set_attribute('attributes', scoring_context.attributes)
+
+            evaluators = case.evaluators + dataset_evaluators
+            evaluator_outputs: list[EvaluationResult] = []
+            evaluator_failures: list[EvaluatorFailure] = []
+            if evaluators:
+                evaluator_outputs_by_task = await task_group_gather(
+                    [lambda ev=ev: run_evaluator(ev, scoring_context) for ev in evaluators]
+                )
+                flattened = [out for outputs in evaluator_outputs_by_task for out in outputs]
+                evaluator_outputs += [o for o in flattened if not isinstance(o, EvaluatorFailure)]
+                evaluator_failures += [o for o in flattened if isinstance(o, EvaluatorFailure)]
+
+            assertions, scores, labels = _group_evaluator_outputs_by_type(evaluator_outputs)
+            case_span.set_attribute('assertions', _evaluation_results_adapter.dump_python(assertions))
+            case_span.set_attribute('scores', _evaluation_results_adapter.dump_python(scores))
+            case_span.set_attribute('labels', _evaluation_results_adapter.dump_python(labels))
+
+            fallback_duration = time.time() - t0
+
+        return ReportCase[InputsT, OutputT, MetadataT](
+            name=report_case_name,
+            inputs=case.inputs,
+            metadata=case.metadata,
+            expected_output=case.expected_output,
+            output=scoring_context.output,
+            metrics=scoring_context.metrics,
+            attributes=scoring_context.attributes,
+            scores=scores,
+            labels=labels,
+            assertions=assertions,
+            task_duration=scoring_context.duration,
+            total_duration=_get_span_duration(case_span, fallback_duration),
+            trace_id=trace_id,
+            span_id=span_id,
+            evaluator_failures=evaluator_failures,
+        )
+    except Exception as exc:
+        return ReportCaseFailure[InputsT, OutputT, MetadataT](
+            name=report_case_name,
+            inputs=case.inputs,
+            metadata=case.metadata,
+            expected_output=case.expected_output,
+            error_msg=f'{type(exc).__name__}: {exc}',
+            trace_id=trace_id,
+            span_id=span_id,
+        )


 _evaluation_results_adapter = TypeAdapter(Mapping[str, EvaluationResult])
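The new `retry` parameter is typed as tenacity's `AsyncRetrying`, so callers build the retry policy with tenacity and pass it to `Dataset.evaluate`. A usage sketch under that assumption; `flaky_task` and the one-case dataset below are placeholders, not part of this commit:

    import asyncio

    from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential

    from pydantic_evals import Case, Dataset


    async def flaky_task(question: str) -> str:
        # Placeholder task; imagine a model call that occasionally raises a transient error.
        return question.upper()


    async def main():
        dataset = Dataset(cases=[Case(name='smoke', inputs='hello', expected_output='HELLO')])
        retry_policy = AsyncRetrying(
            stop=stop_after_attempt(3),             # at most 3 attempts per case
            wait=wait_exponential(multiplier=0.5),  # back off between attempts
        )
        report = await dataset.evaluate(flaky_task, retry=retry_policy)
        print(f'{len(report.cases)} cases succeeded, {len(report.failures)} failed')


    asyncio.run(main())

If every attempt for a case fails, the exception is caught in `_run_task_and_evaluators` and the case is recorded in `report.failures` rather than aborting the whole evaluation run.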

pydantic_evals/pydantic_evals/evaluators/_run_evaluator.py

Lines changed: 31 additions & 17 deletions
@@ -10,7 +10,14 @@
 from typing_extensions import TypeVar

 from .context import EvaluatorContext
-from .evaluator import EvaluationReason, EvaluationResult, EvaluationScalar, Evaluator, EvaluatorOutput
+from .evaluator import (
+    EvaluationReason,
+    EvaluationResult,
+    EvaluationScalar,
+    Evaluator,
+    EvaluatorFailure,
+    EvaluatorOutput,
+)

 InputsT = TypeVar('InputsT', default=Any, contravariant=True)
 OutputT = TypeVar('OutputT', default=Any, contravariant=True)
@@ -19,7 +26,7 @@

 async def run_evaluator(
     evaluator: Evaluator[InputsT, OutputT, MetadataT], ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
-) -> list[EvaluationResult]:
+) -> list[EvaluationResult] | list[EvaluatorFailure]:
     """Run an evaluator and return the results.

     This function runs an evaluator on the given context and processes the results into
@@ -35,22 +42,29 @@ async def run_evaluator(
     Raises:
         ValueError: If the evaluator returns a value of an invalid type.
     """
-    raw_results = await evaluator.evaluate_async(ctx)
-
     try:
-        results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
-    except ValidationError as e:
-        raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
-
-    results = _convert_to_mapping(results, scalar_name=evaluator.get_default_evaluation_name())
-
-    details: list[EvaluationResult] = []
-    for name, result in results.items():
-        if not isinstance(result, EvaluationReason):
-            result = EvaluationReason(value=result)
-        details.append(EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator))
-
-    return details
+        raw_results = await evaluator.evaluate_async(ctx)
+
+        try:
+            results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
+        except ValidationError as e:
+            raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
+
+        results = _convert_to_mapping(results, scalar_name=evaluator.get_default_evaluation_name())
+
+        details: list[EvaluationResult] = []
+        for name, result in results.items():
+            if not isinstance(result, EvaluationReason):
+                result = EvaluationReason(value=result)
+            details.append(EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator))
+
+        return details
+    except Exception as e:
+        return [
+            EvaluatorFailure(
+                name=evaluator.get_default_evaluation_name(), error_msg=f'{type(e).__name__}: {e}', source=evaluator
+            )
+        ]


 _EVALUATOR_OUTPUT_ADAPTER = TypeAdapter[EvaluatorOutput](EvaluatorOutput)
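With `run_evaluator` now catching exceptions, an evaluator that raises is converted into an `EvaluatorFailure` (surfaced as `evaluator_failures` on the report case per the dataset.py changes above) instead of killing the case. A small sketch; `FlakyJudge` is an illustrative evaluator, not part of this commit:

    from dataclasses import dataclass

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext


    @dataclass
    class FlakyJudge(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            # Any exception raised here is caught by run_evaluator and recorded,
            # e.g. with error_msg='RuntimeError: judge model unavailable'.
            raise RuntimeError('judge model unavailable')

Other evaluators attached to the same case still run, and their results are reported normally alongside the recorded failure.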
