From de610486a244608c278b40ea463a4177e69abe46 Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 4 Aug 2021 12:12:02 +0200 Subject: [PATCH 01/12] Refactor how statistics are gathered --- site/src/comparison.rs | 138 ++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 57 deletions(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 0bf7c8b0c..9372dbeb1 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -117,13 +117,13 @@ async fn populate_report(comparison: &Comparison, report: &mut HashMap { - hi: Option>, - lo: Option>, +pub struct ComparisonSummary { + hi: Option, + lo: Option, } -impl ComparisonSummary<'_> { - pub fn summarize_comparison<'a>(comparison: &'a Comparison) -> Option> { +impl ComparisonSummary { + pub fn summarize_comparison(comparison: &Comparison) -> Option { let mut benchmarks = comparison.get_benchmarks(); // Skip empty commits, sometimes happens if there's a compiler bug or so. if benchmarks.len() == 0 { @@ -166,7 +166,7 @@ impl ComparisonSummary<'_> { } /// The changes ordered by their signficance (most significant first) - pub fn ordered_changes(&self) -> Vec<&BenchmarkComparison<'_>> { + pub fn ordered_changes(&self) -> Vec<&BenchmarkComparison> { match (&self.hi, &self.lo) { (None, None) => Vec::new(), (Some(b), None) => vec![b], @@ -287,16 +287,14 @@ pub struct ArtifactData { pub artifact: ArtifactId, /// The pr of the artifact if known pub pr: Option, - /// Benchmark data in the form "$crate-$profile" -> Vec<("$cache", nanoseconds)> - /// - /// * $profile refers to the flavor of compilation like debug, doc, opt(timized), etc. 
- /// * $cache refers to how much of the compilation must be done and how much is cached - /// (e.g., "incr-unchanged" == compiling with full incremental cache and no code having changed) - pub data: HashMap>, + /// Statistics based on benchmark, profile and scenario + pub statistics: StatisticsMap, /// Bootstrap data in the form "$crate" -> nanoseconds pub bootstrap: HashMap, } +type StatisticsMap = HashMap>>; + impl ArtifactData { /// For the given `ArtifactId`, consume the first datapoint in each of the given `SeriesResponse` /// @@ -311,7 +309,7 @@ impl ArtifactData { where T: Iterator)>, { - let data = data_from_series(series); + let statistics = statistics_from_series(series); let bootstrap = conn .get_bootstrap(&[conn.artifact_id(&artifact).await]) @@ -347,40 +345,55 @@ impl ArtifactData { Self { pr, artifact, - data, + statistics, bootstrap, } } } -fn data_from_series( - series: &mut [selector::SeriesResponse], -) -> HashMap> +fn statistics_from_series(series: &mut [selector::SeriesResponse]) -> StatisticsMap where T: Iterator)>, { - let mut data = HashMap::new(); + let mut stats: StatisticsMap = HashMap::new(); for response in series { let (_, point) = response.series.next().expect("must have element"); - let point = if let Some(pt) = point { - pt + let value = if let Some(v) = point { + v } else { continue; }; - data.entry(format!( - "{}-{}", - response.path.get::().unwrap(), - response.path.get::().unwrap(), - )) - .or_insert_with(Vec::new) - .push((response.path.get::().unwrap().to_string(), point)); + let benchmark = *response.path.get::().unwrap(); + let profile = *response.path.get::().unwrap(); + let scenario = *response.path.get::().unwrap(); + stats + .entry(benchmark) + .or_default() + .entry(profile) + .or_default() + .push((scenario, value)); } - data + stats } impl From for api::comparison::ArtifactData { fn from(data: ArtifactData) -> Self { + let stats = data + .statistics + .into_iter() + .flat_map(|(benchmark, profiles)| { + 
profiles.into_iter().map(move |(profile, scenarios)| { + ( + format!("{}-{}", benchmark, profile), + scenarios + .into_iter() + .map(|(s, v)| (s.to_string(), v)) + .collect::>(), + ) + }) + }) + .collect(); api::comparison::ArtifactData { commit: match data.artifact.clone() { ArtifactId::Commit(c) => c.sha, @@ -392,7 +405,7 @@ impl From for api::comparison::ArtifactData { None }, pr: data.pr, - data: data.data, + data: stats, bootstrap: data.bootstrap, } } @@ -437,20 +450,27 @@ impl Comparison { next_commit(&self.b.artifact, master_commits).map(|c| c.sha.clone()) } - fn get_benchmarks<'a>(&'a self) -> Vec> { + fn get_benchmarks(&self) -> Vec { let mut result = Vec::new(); - for (bench_name, a) in self.a.data.iter() { - if bench_name.ends_with("-doc") { - continue; - } - if let Some(b) = self.b.data.get(bench_name) { - for (cache_state, a) in a.iter() { - if let Some(b) = b.iter().find(|(cs, _)| cs == cache_state).map(|(_, b)| b) { - result.push(BenchmarkComparison { - bench_name, - cache_state, - results: (a.clone(), b.clone()), - }) + for (&benchmark, profiles) in self.a.statistics.iter() { + for (&profile, scenarios) in profiles { + if profile == Profile::Doc { + continue; + } + + if let Some(b) = self.b.statistics.get(&benchmark) { + if let Some(b) = b.get(&profile) { + for &(scenario, a) in scenarios.iter() { + if let Some(b) = b.iter().find(|(s, _)| *s == scenario).map(|(_, b)| b) + { + result.push(BenchmarkComparison { + benchmark, + profile, + scenario, + results: (a, *b), + }) + } + } } } } @@ -497,13 +517,15 @@ impl BenchmarkVariances { let mut variance_data: HashMap = HashMap::new(); for _ in previous_commits.iter() { - let series_data = data_from_series(&mut previous_commit_series); - for (bench_and_profile, data) in series_data { - for (cache, val) in data { - variance_data - .entry(format!("{}-{}", bench_and_profile, cache)) - .or_default() - .push(val); + let series_data = statistics_from_series(&mut previous_commit_series); + for (bench, profiles) in 
series_data { + for (profile, scenarios) in profiles { + for (scenario, val) in scenarios { + variance_data + .entry(format!("{}-{}-{}", bench, profile, scenario)) + .or_default() + .push(val); + } } } } @@ -631,14 +653,15 @@ pub fn next_commit<'a>( // A single comparison based on benchmark and cache state #[derive(Debug)] -pub struct BenchmarkComparison<'a> { - bench_name: &'a str, - cache_state: &'a str, +pub struct BenchmarkComparison { + benchmark: Benchmark, + profile: Profile, + scenario: Scenario, results: (f64, f64), } const SIGNIFICANCE_THRESHOLD: f64 = 0.01; -impl BenchmarkComparison<'_> { +impl BenchmarkComparison { fn log_change(&self) -> f64 { let (a, b) = self.results; (b / a).ln() @@ -650,9 +673,10 @@ impl BenchmarkComparison<'_> { } fn is_significant(&self) -> bool { - // This particular (benchmark, cache) combination frequently varies - if self.bench_name.starts_with("coercions-debug") - && self.cache_state == "incr-patched: println" + // This particular test case frequently varies + if &self.benchmark == "coercions" + && self.profile == Profile::Debug + && matches!(self.scenario, Scenario::IncrementalPatch(p) if &p == "println") { self.relative_change().abs() > 2.0 } else { @@ -703,7 +727,7 @@ impl BenchmarkComparison<'_> { writeln!( summary, " (up to {:.1}% on `{}` builds of `{}`)", - percent, self.cache_state, self.bench_name + percent, self.scenario, self.benchmark ) .unwrap(); } From 39608fb7f4155339a8abbd68fe7aef06e6e3245b Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 4 Aug 2021 12:22:07 +0200 Subject: [PATCH 02/12] Group all stats by test case --- site/src/comparison.rs | 76 ++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 51 deletions(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 9372dbeb1..94f3d055b 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -293,7 +293,7 @@ pub struct ArtifactData { pub bootstrap: HashMap, } -type StatisticsMap = HashMap>>; +type 
StatisticsMap = HashMap<(Benchmark, Profile, Scenario), f64>; impl ArtifactData { /// For the given `ArtifactId`, consume the first datapoint in each of the given `SeriesResponse` @@ -367,33 +367,20 @@ where let benchmark = *response.path.get::().unwrap(); let profile = *response.path.get::().unwrap(); let scenario = *response.path.get::().unwrap(); - stats - .entry(benchmark) - .or_default() - .entry(profile) - .or_default() - .push((scenario, value)); + stats.insert((benchmark, profile, scenario), value); } stats } impl From for api::comparison::ArtifactData { fn from(data: ArtifactData) -> Self { - let stats = data - .statistics - .into_iter() - .flat_map(|(benchmark, profiles)| { - profiles.into_iter().map(move |(profile, scenarios)| { - ( - format!("{}-{}", benchmark, profile), - scenarios - .into_iter() - .map(|(s, v)| (s.to_string(), v)) - .collect::>(), - ) - }) - }) - .collect(); + let mut stats: HashMap> = HashMap::new(); + for ((benchmark, profile, scenario), value) in data.statistics { + stats + .entry(format!("{}-{}", benchmark, profile)) + .or_default() + .push((scenario.to_string(), value)) + } api::comparison::ArtifactData { commit: match data.artifact.clone() { ArtifactId::Commit(c) => c.sha, @@ -452,27 +439,18 @@ impl Comparison { fn get_benchmarks(&self) -> Vec { let mut result = Vec::new(); - for (&benchmark, profiles) in self.a.statistics.iter() { - for (&profile, scenarios) in profiles { - if profile == Profile::Doc { - continue; - } + for (&(benchmark, profile, scenario), &a) in self.a.statistics.iter() { + if profile == Profile::Doc { + continue; + } - if let Some(b) = self.b.statistics.get(&benchmark) { - if let Some(b) = b.get(&profile) { - for &(scenario, a) in scenarios.iter() { - if let Some(b) = b.iter().find(|(s, _)| *s == scenario).map(|(_, b)| b) - { - result.push(BenchmarkComparison { - benchmark, - profile, - scenario, - results: (a, *b), - }) - } - } - } - } + if let Some(&b) = self.b.statistics.get(&(benchmark, profile, 
scenario)) { + result.push(BenchmarkComparison { + benchmark, + profile, + scenario, + results: (a, b), + }) } } @@ -518,15 +496,11 @@ impl BenchmarkVariances { let mut variance_data: HashMap = HashMap::new(); for _ in previous_commits.iter() { let series_data = statistics_from_series(&mut previous_commit_series); - for (bench, profiles) in series_data { - for (profile, scenarios) in profiles { - for (scenario, val) in scenarios { - variance_data - .entry(format!("{}-{}-{}", bench, profile, scenario)) - .or_default() - .push(val); - } - } + for ((bench, profile, scenario), value) in series_data { + variance_data + .entry(format!("{}-{}-{}", bench, profile, scenario)) + .or_default() + .push(value); } } if variance_data.len() < Self::MIN_PREVIOUS_COMMITS { From 7e3784d6f5c923a22a63e75169a61b8b38b81cfc Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 4 Aug 2021 13:33:45 +0200 Subject: [PATCH 03/12] Get comparison in order --- site/src/api.rs | 24 ++++-- site/src/comparison.rs | 167 +++++++++++++++++++++++------------------ 2 files changed, 111 insertions(+), 80 deletions(-) diff --git a/site/src/api.rs b/site/src/api.rs index eacf9d993..b3d27a556 100644 --- a/site/src/api.rs +++ b/site/src/api.rs @@ -138,7 +138,6 @@ pub mod bootstrap { } pub mod comparison { - use crate::comparison; use collector::Bound; use database::Date; use serde::{Deserialize, Serialize}; @@ -153,13 +152,12 @@ pub mod comparison { #[derive(Debug, Clone, Serialize)] pub struct Response { - /// The variance data for each benchmark, if any. - pub variance: Option>, /// The names for the previous artifact before `a`, if any. pub prev: Option, - pub a: ArtifactData, - pub b: ArtifactData, + pub a: ArtifactDescription, + pub b: ArtifactDescription, + pub comparisons: Vec, /// The names for the next artifact after `b`, if any. pub next: Option, @@ -169,15 +167,25 @@ pub mod comparison { pub is_contiguous: bool, } - /// A serializable wrapper for `comparison::ArtifactData`. 
#[derive(Debug, Clone, Serialize)] - pub struct ArtifactData { + pub struct ArtifactDescription { pub commit: String, pub date: Option, pub pr: Option, - pub data: HashMap>, pub bootstrap: HashMap, } + + /// A serializable wrapper for `comparison::ArtifactData`. + #[derive(Debug, Clone, Serialize)] + pub struct Comparison { + pub benchmark: String, + pub profile: String, + pub scenario: String, + pub is_significant: bool, + pub is_dodgy: bool, + pub historical_statistics: Option>, + pub statistics: (f64, f64), + } } pub mod status { diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 94f3d055b..a37b3f89b 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -12,7 +12,7 @@ use collector::Bound; use log::debug; use serde::Serialize; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::error::Error; use std::hash::Hash; use std::sync::Arc; @@ -96,12 +96,29 @@ pub async fn handle_compare( let prev = comparison.prev(&master_commits); let next = comparison.next(&master_commits); let is_contiguous = comparison.is_contiguous(&*conn, &master_commits).await; + let comparisons = comparison + .statistics + .into_iter() + .map(|comparison| api::comparison::Comparison { + benchmark: comparison.benchmark.to_string(), + profile: comparison.profile.to_string(), + scenario: comparison.scenario.to_string(), + is_dodgy: comparison + .variance + .as_ref() + .map(|v| v.is_dodgy()) + .unwrap_or(false), + is_significant: comparison.is_significant(), + historical_statistics: comparison.variance.map(|v| v.data), + statistics: comparison.results, + }) + .collect(); Ok(api::comparison::Response { - variance: comparison.benchmark_variances.map(|b| b.data), prev, a: comparison.a.into(), b: comparison.b.into(), + comparisons, next, is_contiguous, }) @@ -124,7 +141,7 @@ pub struct ComparisonSummary { impl ComparisonSummary { pub fn summarize_comparison(comparison: &Comparison) -> Option { - let mut benchmarks = 
comparison.get_benchmarks(); + let mut benchmarks = comparison.get_benchmarks().collect::>(); // Skip empty commits, sometimes happens if there's a compiler bug or so. if benchmarks.len() == 0 { return None; @@ -141,14 +158,14 @@ impl ComparisonSummary { .min_by(|&(_, b1), &(_, b2)| cmp(b1, b2)) .filter(|(_, c)| c.is_significant() && !c.is_increase()) .map(|(i, _)| i); - let lo = lo.map(|lo| benchmarks.remove(lo)); + let lo = lo.map(|lo| benchmarks.remove(lo)).cloned(); let hi = benchmarks .iter() .enumerate() .max_by(|&(_, b1), &(_, b2)| cmp(b1, b2)) .filter(|(_, c)| c.is_significant() && c.is_increase()) .map(|(i, _)| i); - let hi = hi.map(|hi| benchmarks.remove(hi)); + let hi = hi.map(|hi| benchmarks.remove(hi)).cloned(); Some(ComparisonSummary { hi, lo }) } @@ -250,11 +267,30 @@ async fn compare_given_commits( let mut responses = ctxt.statistic_series(query.clone(), aids).await?; let conn = ctxt.conn().await; + let statistics_for_a = statistics_from_series(&mut responses); + let statistics_for_b = statistics_from_series(&mut responses); + + let variances = BenchmarkVariances::calculate(ctxt, a.clone(), master_commits, stat).await?; + let statistics = statistics_for_a + .into_iter() + .filter_map(|(test_case, a)| { + statistics_for_b + .get(&test_case) + .map(|&b| BenchmarkComparison { + benchmark: test_case.0, + profile: test_case.1, + scenario: test_case.2, + variance: variances + .as_ref() + .and_then(|v| v.data.get(&test_case).cloned()), + results: (a, b), + }) + }) + .collect(); Ok(Some(Comparison { - benchmark_variances: BenchmarkVariances::calculate(ctxt, a.clone(), master_commits, stat) - .await?, - a: ArtifactData::consume_one(&*conn, a.clone(), &mut responses, master_commits).await, - b: ArtifactData::consume_one(&*conn, b.clone(), &mut responses, master_commits).await, + a: ArtifactDescription::for_artifact(&*conn, a.clone(), master_commits).await, + b: ArtifactDescription::for_artifact(&*conn, b.clone(), master_commits).await, + statistics, })) } @@ 
-280,37 +316,30 @@ fn previous_commits( prevs } -/// Data associated with a specific artifact +/// Detailed description of a specific artifact #[derive(Debug, Clone)] -pub struct ArtifactData { +pub struct ArtifactDescription { /// The artifact in question pub artifact: ArtifactId, /// The pr of the artifact if known pub pr: Option, - /// Statistics based on benchmark, profile and scenario - pub statistics: StatisticsMap, /// Bootstrap data in the form "$crate" -> nanoseconds pub bootstrap: HashMap, } -type StatisticsMap = HashMap<(Benchmark, Profile, Scenario), f64>; +type StatisticsMap = HashMap; +type TestCase = (Benchmark, Profile, Scenario); -impl ArtifactData { +impl ArtifactDescription { /// For the given `ArtifactId`, consume the first datapoint in each of the given `SeriesResponse` /// /// It is assumed that the provided `ArtifactId` matches the artifact id of the next data /// point for all of `SeriesResponse`. If this is not true, this function will panic. - async fn consume_one<'a, T>( + async fn for_artifact( conn: &dyn database::Connection, artifact: ArtifactId, - series: &mut [selector::SeriesResponse], master_commits: &[collector::MasterCommit], - ) -> Self - where - T: Iterator)>, - { - let statistics = statistics_from_series(series); - + ) -> Self { let bootstrap = conn .get_bootstrap(&[conn.artifact_id(&artifact).await]) .await; @@ -345,7 +374,6 @@ impl ArtifactData { Self { pr, artifact, - statistics, bootstrap, } } @@ -372,16 +400,9 @@ where stats } -impl From for api::comparison::ArtifactData { - fn from(data: ArtifactData) -> Self { - let mut stats: HashMap> = HashMap::new(); - for ((benchmark, profile, scenario), value) in data.statistics { - stats - .entry(format!("{}-{}", benchmark, profile)) - .or_default() - .push((scenario.to_string(), value)) - } - api::comparison::ArtifactData { +impl From for api::comparison::ArtifactDescription { + fn from(data: ArtifactDescription) -> Self { + api::comparison::ArtifactDescription { commit: match 
data.artifact.clone() { ArtifactId::Commit(c) => c.sha, ArtifactId::Tag(t) => t, @@ -392,7 +413,6 @@ impl From for api::comparison::ArtifactData { None }, pr: data.pr, - data: stats, bootstrap: data.bootstrap, } } @@ -400,12 +420,10 @@ impl From for api::comparison::ArtifactData { // A comparison of two artifacts pub struct Comparison { - /// Data on how variable benchmarks have historically been - /// - /// Is `None` if we cannot determine historical variance - pub benchmark_variances: Option, - pub a: ArtifactData, - pub b: ArtifactData, + pub a: ArtifactDescription, + pub b: ArtifactDescription, + /// Statistics based on test case + pub statistics: HashSet, } impl Comparison { @@ -437,34 +455,16 @@ impl Comparison { next_commit(&self.b.artifact, master_commits).map(|c| c.sha.clone()) } - fn get_benchmarks(&self) -> Vec { - let mut result = Vec::new(); - for (&(benchmark, profile, scenario), &a) in self.a.statistics.iter() { - if profile == Profile::Doc { - continue; - } - - if let Some(&b) = self.b.statistics.get(&(benchmark, profile, scenario)) { - result.push(BenchmarkComparison { - benchmark, - profile, - scenario, - results: (a, b), - }) - } - } - - result + fn get_benchmarks(&self) -> impl Iterator { + self.statistics.iter().filter(|b| b.profile != Profile::Doc) } } /// A description of the amount of variance a certain benchmark is historically /// experiencing at a given point in time. 
pub struct BenchmarkVariances { - /// Variance data on a per benchmark basis - /// Key: $benchmark-$profile-$cache - /// Value: `BenchmarkVariance` - pub data: HashMap, + /// Variance data on a per test case basis + pub data: HashMap<(Benchmark, Profile, Scenario), BenchmarkVariance>, } impl BenchmarkVariances { @@ -493,21 +493,18 @@ impl BenchmarkVariances { .statistic_series(query, previous_commits.clone()) .await?; - let mut variance_data: HashMap = HashMap::new(); + let mut variance_data: HashMap<(Benchmark, Profile, Scenario), BenchmarkVariance> = + HashMap::new(); for _ in previous_commits.iter() { - let series_data = statistics_from_series(&mut previous_commit_series); - for ((bench, profile, scenario), value) in series_data { - variance_data - .entry(format!("{}-{}-{}", bench, profile, scenario)) - .or_default() - .push(value); + for (test_case, stat) in statistics_from_series(&mut previous_commit_series) { + variance_data.entry(test_case).or_default().push(stat); } } if variance_data.len() < Self::MIN_PREVIOUS_COMMITS { return Ok(None); } - for (bench, results) in variance_data.iter_mut() { + for ((bench, _, _), results) in variance_data.iter_mut() { debug!("Calculating variance for: {}", bench); results.calculate_description(); } @@ -576,6 +573,15 @@ impl BenchmarkVariance { self.description = BenchmarkVarianceDescription::Noisy(percent_change); } } + + /// Whether we can trust this benchmark or not + fn is_dodgy(&self) -> bool { + matches!( + self.description, + BenchmarkVarianceDescription::Noisy(_) + | BenchmarkVarianceDescription::HighlyVariable(_) + ) + } } #[derive(Debug, Clone, Copy, Serialize)] @@ -626,11 +632,12 @@ pub fn next_commit<'a>( } // A single comparison based on benchmark and cache state -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct BenchmarkComparison { benchmark: Benchmark, profile: Profile, scenario: Scenario, + variance: Option, results: (f64, f64), } @@ -706,6 +713,22 @@ impl BenchmarkComparison { .unwrap(); } } +impl 
std::cmp::PartialEq for BenchmarkComparison { + fn eq(&self, other: &Self) -> bool { + self.benchmark == other.benchmark + && self.profile == other.profile + && self.scenario == other.scenario + } +} +impl std::cmp::Eq for BenchmarkComparison {} + +impl std::hash::Hash for BenchmarkComparison { + fn hash(&self, state: &mut H) { + self.benchmark.hash(state); + self.profile.hash(state); + self.scenario.hash(state); + } +} // The direction of a performance change #[derive(PartialEq, Eq, Hash)] From fefad06287dd4c3668f70ace2e218d6fa62ecc2c Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 4 Aug 2021 15:02:13 +0200 Subject: [PATCH 04/12] Update comparison page to injest new API data --- site/src/comparison.rs | 2 +- site/static/compare.html | 130 +++++++++++++++++---------------------- 2 files changed, 59 insertions(+), 73 deletions(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index a37b3f89b..6f60172ea 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -641,7 +641,7 @@ pub struct BenchmarkComparison { results: (f64, f64), } -const SIGNIFICANCE_THRESHOLD: f64 = 0.01; +const SIGNIFICANCE_THRESHOLD: f64 = 0.001; impl BenchmarkComparison { fn log_change(&self) -> f64 { let (a, b) = self.results; diff --git a/site/static/compare.html b/site/static/compare.html index bd84caa62..800fd20fd 100644 --- a/site/static/compare.html +++ b/site/static/compare.html @@ -525,28 +525,23 @@

Comparing {{stat}} between { return true; } } - function toVariants(name) { + function toVariants(results) { let variants = []; - for (let d of data.a.data[name]) { - const key = d[0]; - const datumA = d[1]; - const datumB = data.b.data[name]?.find(x => x[0] == key)?.[1]; - if (!datumB) { - continue; - } + for (let r of results) { + const scenarioName = r.scenario; + const datumA = r.statistics[0]; + const datumB = r.statistics[1]; + const isSignificant = r.is_significant; let percent = (100 * (datumB - datumA) / datumA); - let isDodgy = false; - if (data.variance) { - let variance = data.variance[name + "-" + key]; - isDodgy = (variance?.description?.type ?? "Normal") != "Normal"; - } - if (shouldShowBuild(key)) { + let isDodgy = r.is_dodgy; + if (shouldShowBuild(scenarioName)) { variants.push({ - casename: key, + casename: scenarioName, datumA, datumB, percent, isDodgy, + isSignificant }); } } @@ -555,25 +550,33 @@

Comparing {{stat}} between { } let benches = - Object.keys(data.a.data). - filter(n => filter.name && filter.name.trim() ? n.includes(filter.name.trim()) : true). - map(name => { - const variants = toVariants(name).filter(v => filter.showOnlySignificant ? isSignificant(v.percent, v.isDodgy) : true); - const pcts = variants.map(field => parseFloat(field.percent)); - const maxPct = Math.max(...pcts).toFixed(1); - const minPct = Math.min(...pcts).toFixed(1); - if (variants.length > 0) { - variants[0].first = true; - } - - return { - name, - variants, - maxPct, - minPct, - }; - }). - filter(b => b.variants.length > 0); + data.comparisons. + filter(n => filter.name && filter.name.trim() ? n.benchmark.includes(filter.name.trim()) : true). + reduce((accum, next) => { + accum[next.benchmark + "-" + next.profile] ||= []; + accum[next.benchmark + "-" + next.profile].push(next); + return accum; + }, {}); + benches = Object.entries(benches). + map(c => { + const name = c[0]; + const comparison = c[1]; + const variants = toVariants(comparison).filter(v => filter.showOnlySignificant ? v.isSignificant : true); + const pcts = variants.map(field => parseFloat(field.percent)); + const maxPct = Math.max(...pcts).toFixed(1); + const minPct = Math.min(...pcts).toFixed(1); + if (variants.length > 0) { + variants[0].first = true; + } + + return { + name, + variants, + maxPct, + minPct, + }; + }). + filter(b => b.variants.length > 0); const largestChange = a => Math.max(Math.abs(a.minPct), Math.abs(a.maxPct)); benches.sort((a, b) => largestChange(b) - largestChange(a)); @@ -634,29 +637,25 @@

Comparing {{stat}} between { return findQueryParam("stat") || "instructions:u"; }, summary() { - const filtered = Object.fromEntries(this.benches.map(b => [b.name, Object.fromEntries(b.variants.map(v => [v.casename, true]))])); + // Create object with each test case that is not filtered out as a key + const filtered = Object.fromEntries(this.benches.flatMap(b => b.variants.map(v => [b.name + "-" + v.casename, true]))); const newCount = { regressions: 0, improvements: 0, unchanged: 0 } let result = { all: { ...newCount }, filtered: { ...newCount } } - for (let benchmarkAndProfile of Object.keys(this.data.a.data)) { - for (let d of this.data.a.data[benchmarkAndProfile]) { - const scenario = d[0]; - const datumA = d[1]; - const datumB = this.data.b.data[benchmarkAndProfile]?.find(x => x[0] == scenario)?.[1]; - if (!datumB) { - continue; - } - let percent = 100 * ((datumB - datumA) / datumA); - const isDodgy = this.isDodgy(benchmarkAndProfile, scenario); - if (percent > 0 && isSignificant(percent, isDodgy)) { - result.all.regressions += 1; - result.filtered.regressions += filtered[benchmarkAndProfile]?.[scenario] || 0; - } else if (percent < 0 && isSignificant(percent, isDodgy)) { - result.all.improvements += 1; - result.filtered.improvements += filtered[benchmarkAndProfile]?.[scenario] || 0; - } else { - result.all.unchanged += 1; - result.filtered.unchanged += filtered[benchmarkAndProfile]?.[scenario] || 0; - } + for (let d of this.data.comparisons) { + const testCase = testCaseString(d) + const scenario = d.scenario; + const datumA = d.statistics[0]; + const datumB = d.statistics[1]; + let percent = 100 * ((datumB - datumA) / datumA); + if (percent > 0 && d.is_significant) { + result.all.regressions += 1; + result.filtered.regressions += filtered[testCase] || 0; + } else if (percent < 0 && d.is_significant) { + result.all.improvements += 1; + result.filtered.improvements += filtered[testCase] || 0; + } else { + result.all.unchanged += 1; + result.filtered.unchanged 
+= filtered[testCase] || 0; } } return result; @@ -719,26 +718,9 @@

Comparing {{stat}} between { } return result; }, - isDodgy(benchmarkAndProfile, scenario) { - let variance = this.data.variance; - if (!variance) { - return false; - } - variance = this.data.variance[benchmarkAndProfile + "-" + scenario]; - return (variance?.description?.type ?? "Normal") != "Normal"; - }, }, }); - function isSignificant(percent, isNoisy) { - const perAbs = Math.abs(percent); - if (isNoisy) { - return perAbs > 1.0; - } else { - return perAbs >= 0.2; - } - } - function toggleFilters(id, toggle) { let styles = document.getElementById(id).style; let indicator = document.getElementById(toggle); @@ -753,6 +735,10 @@

Comparing {{stat}} between { toggleFilters("filters-content", "filters-toggle-indicator"); toggleFilters("search-content", "search-toggle-indicator"); + function testCaseString(testCase) { + return testCase.benchmark + "-" + testCase.profile + "-" + testCase.scenario; + } + document.getElementById("filters-toggle").onclick = (e) => { toggleFilters("filters-content", "filters-toggle-indicator"); }; From a00ccefd5a398241d943bd5cab94dca164bad1bf Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 4 Aug 2021 16:22:08 +0200 Subject: [PATCH 05/12] Clarify significance threshold --- site/src/comparison.rs | 66 +++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 6f60172ea..5e9ff81db 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -103,11 +103,7 @@ pub async fn handle_compare( benchmark: comparison.benchmark.to_string(), profile: comparison.profile.to_string(), scenario: comparison.scenario.to_string(), - is_dodgy: comparison - .variance - .as_ref() - .map(|v| v.is_dodgy()) - .unwrap_or(false), + is_dodgy: comparison.is_dodgy(), is_significant: comparison.is_significant(), historical_statistics: comparison.variance.map(|v| v.data), statistics: comparison.results, @@ -135,8 +131,8 @@ async fn populate_report(comparison: &Comparison, report: &mut HashMap, - lo: Option, + hi: Option, + lo: Option, } impl ComparisonSummary { @@ -147,7 +143,7 @@ impl ComparisonSummary { return None; } - let cmp = |b1: &BenchmarkComparison, b2: &BenchmarkComparison| { + let cmp = |b1: &TestResultComparison, b2: &TestResultComparison| { b1.log_change() .partial_cmp(&b2.log_change()) .unwrap_or(std::cmp::Ordering::Equal) @@ -183,7 +179,7 @@ impl ComparisonSummary { } /// The changes ordered by their signficance (most significant first) - pub fn ordered_changes(&self) -> Vec<&BenchmarkComparison> { + pub fn ordered_changes(&self) -> Vec<&TestResultComparison> { match 
(&self.hi, &self.lo) { (None, None) => Vec::new(), (Some(b), None) => vec![b], @@ -276,7 +272,7 @@ async fn compare_given_commits( .filter_map(|(test_case, a)| { statistics_for_b .get(&test_case) - .map(|&b| BenchmarkComparison { + .map(|&b| TestResultComparison { benchmark: test_case.0, profile: test_case.1, scenario: test_case.2, @@ -423,7 +419,7 @@ pub struct Comparison { pub a: ArtifactDescription, pub b: ArtifactDescription, /// Statistics based on test case - pub statistics: HashSet, + pub statistics: HashSet, } impl Comparison { @@ -455,7 +451,7 @@ impl Comparison { next_commit(&self.b.artifact, master_commits).map(|c| c.sha.clone()) } - fn get_benchmarks(&self) -> impl Iterator { + fn get_benchmarks(&self) -> impl Iterator { self.statistics.iter().filter(|b| b.profile != Profile::Doc) } } @@ -631,9 +627,9 @@ pub fn next_commit<'a>( } } -// A single comparison based on benchmark and cache state +// A single comparison between two test results #[derive(Debug, Clone)] -pub struct BenchmarkComparison { +pub struct TestResultComparison { benchmark: Benchmark, profile: Profile, scenario: Scenario, @@ -641,8 +637,15 @@ pub struct BenchmarkComparison { results: (f64, f64), } -const SIGNIFICANCE_THRESHOLD: f64 = 0.001; -impl BenchmarkComparison { +impl TestResultComparison { + /// The amount of relative change considered significant when + /// the test case is not dodgy + const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD: f64 = 0.01; + + /// The amount of relative change considered significant when + /// the test case is dodgy + const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY: f64 = 1.0; + fn log_change(&self) -> f64 { let (a, b) = self.results; (b / a).ln() @@ -654,15 +657,20 @@ impl BenchmarkComparison { } fn is_significant(&self) -> bool { - // This particular test case frequently varies - if &self.benchmark == "coercions" - && self.profile == Profile::Debug - && matches!(self.scenario, Scenario::IncrementalPatch(p) if &p == "println") - { - 
self.relative_change().abs() > 2.0 + let threshold = if self.is_dodgy() { + Self::SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY } else { - self.log_change().abs() > SIGNIFICANCE_THRESHOLD - } + Self::SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD + }; + + self.relative_change().abs() > threshold + } + + fn is_dodgy(&self) -> bool { + self.variance + .as_ref() + .map(|v| v.is_dodgy()) + .unwrap_or(false) } fn relative_change(&self) -> f64 { @@ -671,7 +679,7 @@ impl BenchmarkComparison { } fn direction(&self) -> Direction { - if self.log_change() > 0.0 { + if self.relative_change() > 0.0 { Direction::Regression } else { Direction::Improvement @@ -680,7 +688,7 @@ impl BenchmarkComparison { pub fn summary_line(&self, summary: &mut String, link: Option<&str>) { use std::fmt::Write; - let magnitude = self.log_change().abs(); + let magnitude = self.relative_change().abs(); let size = if magnitude > 0.10 { "Very large" } else if magnitude > 0.05 { @@ -713,16 +721,16 @@ impl BenchmarkComparison { .unwrap(); } } -impl std::cmp::PartialEq for BenchmarkComparison { +impl std::cmp::PartialEq for TestResultComparison { fn eq(&self, other: &Self) -> bool { self.benchmark == other.benchmark && self.profile == other.profile && self.scenario == other.scenario } } -impl std::cmp::Eq for BenchmarkComparison {} +impl std::cmp::Eq for TestResultComparison {} -impl std::hash::Hash for BenchmarkComparison { +impl std::hash::Hash for TestResultComparison { fn hash(&self, state: &mut H) { self.benchmark.hash(state); self.profile.hash(state); From 933e58d450bc75f8e68c7951ceaf7a4f7bc2ff78 Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 4 Aug 2021 18:16:24 +0200 Subject: [PATCH 06/12] Add docs about comparison analysis --- docs/comparison-analysis.md | 45 +++++++++++++++++++++++++++++++++++++ site/src/comparison.rs | 2 +- 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 docs/comparison-analysis.md diff --git a/docs/comparison-analysis.md b/docs/comparison-analysis.md new file 
mode 100644 index 000000000..9dd79bbcf --- /dev/null +++ b/docs/comparison-analysis.md @@ -0,0 +1,45 @@ +# Comparison Analysis + +The following is a detailed explanation of the process undertaken to automate the analysis of test results for two artifacts of interest (artifact A and B). + +This analysis can be done by hand, by using the [comparison page](https://perf.rust-lang.org/compare.html) and entering the two artifacts of interest in the form at the top. + +## The goal + +The goal of the analysis is to determine whether artifact B represents a performance improvement, regression, or mixed result from artifact A. Typically artifact B will be based on artifact A with a certain pull request's changes applied to artifact A to get artifact B, but this is not required. + +Performance analysis is typically used to determine whether a pull request has introduced performance regressions or improvements. + +## What is being compared? + +At the core of comparison analysis are the collection of test results for the two artifacts being compared. For each test case, the statistics for the two artifacts are compared and a relative change percentage is obtained using the following formula: + +``` +100 * ((statisticForArtifactB - statisticForArtifactA) / statisticForArtifactA) +``` + +## High-level analysis description + +Analysis of the changes is performed in order to determine whether artifact B represents a performance change over artifact A. At a high level the analysis performed takes the following form: + +Are there 1 or more _significant_ test results that indicate performance changes. If all significant test results indicate regressions (i.e., all percent relative changes are positive), then artifact B represents a performance regression over artifact A. If all significant test results indicate improvements (i.e., all percent relative changes are negative), then artifact B represents a performance improvement over artifact B. 
If some significant test results indicate improvement and others indicate regressions, then the performance change is mixed. + +* What makes a test result significant? + +A test result is significant if the relative change percentage meets some threshold. What the threshold is depends of whether the test case is "dodgy" or not (see below for an examination of "dodginess"). For dodgy test cases, the threshold is set at 1%. For non-dodgy test cases, the threshold is set to 0.1%. + +* What makes a test case "dodgy"? + +A test case is "dodgy" if it shows signs of either being noisy or highly variable. + +To determine noise and high variability, the previous 100 test results for the test case in question are examined by calculating relative delta changes between adjacent test results. This is done with the following formula (where `testResult1` is the test result immediately preceding `testResult2`): + +``` +(testResult2 - testResult1) / testResult1 +``` + +Any relative delta change that is above a threshold (currently 0.1) is considered "significant" for the purposes of dodginess detection. + +A highly variable test case is one where a certain percentage (currently 5%) of relative delta changes are significant. The logic being that test cases should only display significant relative delta changes a small percentage of the time. + +A noisy test case is one where of all the non-significant relative delta changes, the average delta change is still above some threshold (0.001). The logic being that non-significant changes should, on average, being very close to 0. If they are not close to zero, then they are noisy. 
\ No newline at end of file diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 5e9ff81db..59c0a2a6c 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -640,7 +640,7 @@ pub struct TestResultComparison { impl TestResultComparison { /// The amount of relative change considered significant when /// the test case is not dodgy - const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD: f64 = 0.01; + const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD: f64 = 0.001; /// The amount of relative change considered significant when /// the test case is dodgy From 960c67f500e88b5481f6a078149666687d94baaf Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 4 Aug 2021 18:51:07 +0200 Subject: [PATCH 07/12] Introduce confidence levels --- docs/comparison-analysis.md | 20 +++++-- site/src/comparison.rs | 104 ++++++++++++++++++++++-------------- site/src/github.rs | 4 +- 3 files changed, 82 insertions(+), 46 deletions(-) diff --git a/docs/comparison-analysis.md b/docs/comparison-analysis.md index 9dd79bbcf..ad6d091ec 100644 --- a/docs/comparison-analysis.md +++ b/docs/comparison-analysis.md @@ -22,13 +22,15 @@ At the core of comparison analysis are the collection of test results for the tw Analysis of the changes is performed in order to determine whether artifact B represents a performance change over artifact A. At a high level the analysis performed takes the following form: -Are there 1 or more _significant_ test results that indicate performance changes. If all significant test results indicate regressions (i.e., all percent relative changes are positive), then artifact B represents a performance regression over artifact A. If all significant test results indicate improvements (i.e., all percent relative changes are negative), then artifact B represents a performance improvement over artifact B. If some significant test results indicate improvement and others indicate regressions, then the performance change is mixed. 
+How many _significant_ test results indicate performance changes? If all significant test results indicate regressions (i.e., all percent relative changes are positive), then artifact B represents a performance regression over artifact A. If all significant test results indicate improvements (i.e., all percent relative changes are negative), then artifact B represents a performance improvement over artifact B. If some significant test results indicate improvement and others indicate regressions, then the performance change is mixed. -* What makes a test result significant? +Whether we actually _report_ an analysis or not depends on the context and how _confident_ we are in the summary of the results (see below for an explanation of how confidence is derived). For example, in pull request performance "try" runs, we report a performance change if we are at least confident that the results are "probably relevant", while for the triage report, we only report if the we are confident the results are "definitely relevant". + +### What makes a test result significant? A test result is significant if the relative change percentage meets some threshold. What the threshold is depends of whether the test case is "dodgy" or not (see below for an examination of "dodginess"). For dodgy test cases, the threshold is set at 1%. For non-dodgy test cases, the threshold is set to 0.1%. -* What makes a test case "dodgy"? +### What makes a test case "dodgy"? A test case is "dodgy" if it shows signs of either being noisy or highly variable. @@ -42,4 +44,14 @@ Any relative delta change that is above a threshold (currently 0.1) is considere A highly variable test case is one where a certain percentage (currently 5%) of relative delta changes are significant. The logic being that test cases should only display significant relative delta changes a small percentage of the time. 
-A noisy test case is one where of all the non-significant relative delta changes, the average delta change is still above some threshold (0.001). The logic being that non-significant changes should, on average, being very close to 0. If they are not close to zero, then they are noisy. \ No newline at end of file +A noisy test case is one where of all the non-significant relative delta changes, the average delta change is still above some threshold (0.001). The logic being that non-significant changes should, on average, being very close to 0. If they are not close to zero, then they are noisy. + +### How is confidence in whether a test analysis is "relevant" determined? + +The confidence in whether a test analysis is relevant depends on the number of significant test results. Depending on that number a confidence level is reached: + +* Maybe relevant: 0-3 changes +* Probably relevant: 4-6 changes +* Definitely relevant: >6 changes + +Note: changes can be any combination of positive or negative changes. 
\ No newline at end of file diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 59c0a2a6c..0255aca4a 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -122,24 +122,30 @@ pub async fn handle_compare( async fn populate_report(comparison: &Comparison, report: &mut HashMap>) { if let Some(summary) = ComparisonSummary::summarize_comparison(comparison) { - if let Some(direction) = summary.direction() { - let entry = report.entry(direction).or_default(); + if summary.confidence().is_definitely_relevant() { + if let Some(direction) = summary.direction() { + let entry = report.entry(direction).or_default(); - entry.push(summary.write(comparison).await) + entry.push(summary.write(comparison).await) + } } } } pub struct ComparisonSummary { - hi: Option, - lo: Option, + /// Significant comparisons + comparisons: Vec, } impl ComparisonSummary { pub fn summarize_comparison(comparison: &Comparison) -> Option { - let mut benchmarks = comparison.get_benchmarks().collect::>(); + let mut comparisons = comparison + .get_benchmarks() + .filter(|c| c.is_significant()) + .cloned() + .collect::>(); // Skip empty commits, sometimes happens if there's a compiler bug or so. 
- if benchmarks.len() == 0 { + if comparisons.len() == 0 { return None; } @@ -148,27 +154,17 @@ impl ComparisonSummary { .partial_cmp(&b2.log_change()) .unwrap_or(std::cmp::Ordering::Equal) }; - let lo = benchmarks - .iter() - .enumerate() - .min_by(|&(_, b1), &(_, b2)| cmp(b1, b2)) - .filter(|(_, c)| c.is_significant() && !c.is_increase()) - .map(|(i, _)| i); - let lo = lo.map(|lo| benchmarks.remove(lo)).cloned(); - let hi = benchmarks - .iter() - .enumerate() - .max_by(|&(_, b1), &(_, b2)| cmp(b1, b2)) - .filter(|(_, c)| c.is_significant() && c.is_increase()) - .map(|(i, _)| i); - let hi = hi.map(|hi| benchmarks.remove(hi)).cloned(); + comparisons.sort_by(cmp); - Some(ComparisonSummary { hi, lo }) + Some(ComparisonSummary { comparisons }) } /// The direction of the changes pub fn direction(&self) -> Option { - let d = match (&self.hi, &self.lo) { + let d = match ( + self.largest_positive_change(), + &self.largest_negative_change(), + ) { (None, None) => return None, (Some(b), None) => b.direction(), (None, Some(b)) => b.direction(), @@ -178,22 +174,31 @@ impl ComparisonSummary { Some(d) } - /// The changes ordered by their signficance (most significant first) - pub fn ordered_changes(&self) -> Vec<&TestResultComparison> { - match (&self.hi, &self.lo) { - (None, None) => Vec::new(), - (Some(b), None) => vec![b], - (None, Some(b)) => vec![b], - (Some(a), Some(b)) - if b.log_change() - .abs() - .partial_cmp(&a.log_change().abs()) - .unwrap_or(std::cmp::Ordering::Equal) - == std::cmp::Ordering::Greater => - { - vec![b, a] - } - (Some(a), Some(b)) => vec![a, b], + /// The n largest changes ordered by the magnitude of their change + pub fn largest_changes(&self, n: usize) -> Vec<&TestResultComparison> { + let mut changes: Vec<&TestResultComparison> = self.comparisons.iter().take(n).collect(); + changes.sort_by(|a, b| { + b.log_change() + .abs() + .partial_cmp(&a.log_change().abs()) + .unwrap_or(std::cmp::Ordering::Equal) + }); + changes + } + + pub fn 
largest_positive_change(&self) -> Option<&TestResultComparison> { + self.comparisons.first().filter(|s| s.is_increase()) + } + + pub fn largest_negative_change(&self) -> Option<&TestResultComparison> { + self.comparisons.last().filter(|s| !s.is_increase()) + } + + pub fn confidence(&self) -> ComparisonConfidence { + match self.comparisons.len() { + 0..=3 => ComparisonConfidence::MaybeRelevant, + 4..=6 => ComparisonConfidence::ProbablyRelevant, + _ => ComparisonConfidence::DefinitelyRelevant, } } @@ -213,7 +218,7 @@ impl ComparisonSummary { let end = &comparison.b.artifact; let link = &compare_link(start, end); - for change in self.ordered_changes() { + for change in self.largest_changes(2) { write!(result, "- ").unwrap(); change.summary_line(&mut result, Some(link)) } @@ -221,6 +226,25 @@ impl ComparisonSummary { } } +/// The amount of confidence we have that a comparison actually represents a real +/// change in the performance characteristics. +#[derive(Clone, Copy)] +pub enum ComparisonConfidence { + MaybeRelevant, + ProbablyRelevant, + DefinitelyRelevant, +} + +impl ComparisonConfidence { + pub fn is_definitely_relevant(self) -> bool { + matches!(self, Self::DefinitelyRelevant) + } + + pub fn is_atleast_probably_relevant(self) -> bool { + matches!(self, Self::DefinitelyRelevant | Self::ProbablyRelevant) + } +} + /// Compare two bounds on a given stat /// /// Returns Ok(None) when no data for the end bound is present diff --git a/site/src/github.rs b/site/src/github.rs index 2cb1f6d87..08ae3fc7d 100644 --- a/site/src/github.rs +++ b/site/src/github.rs @@ -606,7 +606,7 @@ async fn categorize_benchmark( const DISAGREEMENT: &str = "If you disagree with this performance assessment, \ please file an issue in [rust-lang/rustc-perf](https://github.com/rust-lang/rustc-perf/issues/new)."; let (summary, direction) = match ComparisonSummary::summarize_comparison(&comparison) { - Some(s) if s.direction().is_some() => { + Some(s) if 
s.confidence().is_atleast_probably_relevant() => { let direction = s.direction().unwrap(); (s, direction) } @@ -630,7 +630,7 @@ async fn categorize_benchmark( "This change led to significant {} in compiler performance.\n", category ); - for change in summary.ordered_changes() { + for change in summary.largest_changes(2) { write!(result, "- ").unwrap(); change.summary_line(&mut result, None) } From c186ad4e4ff6a627c9f42bbebb39be134a1a5602 Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Thu, 5 Aug 2021 09:52:41 +0200 Subject: [PATCH 08/12] Fix percentages --- site/src/comparison.rs | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 0255aca4a..8e596e206 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -545,8 +545,8 @@ impl BenchmarkVariance { const SIGNFICANT_DELTA_THRESHOLD: f64 = 0.01; /// The percentage of significant changes that we consider too high const SIGNFICANT_CHANGE_THRESHOLD: f64 = 5.0; - /// The percentage of change that constitutes noisy data - const NOISE_THRESHOLD: f64 = 0.1; + /// The ratio of change that constitutes noisy data + const NOISE_THRESHOLD: f64 = 0.001; fn push(&mut self, value: f64) { self.data.push(value); @@ -580,17 +580,16 @@ impl BenchmarkVariance { ); if percent_significant_changes > Self::SIGNFICANT_CHANGE_THRESHOLD { - self.description = - BenchmarkVarianceDescription::HighlyVariable(percent_significant_changes); + self.description = BenchmarkVarianceDescription::HighlyVariable; return; } let delta_mean = non_significant.iter().map(|(&d, _)| d).sum::() / (non_significant.len() as f64); - let percent_change = (delta_mean / results_mean) * 100.0; - debug!("Percent change: {:.3}%", percent_change); - if percent_change > Self::NOISE_THRESHOLD { - self.description = BenchmarkVarianceDescription::Noisy(percent_change); + let ratio_change = delta_mean / results_mean; + debug!("Ratio change: {:.3}", ratio_change); + if 
ratio_change > Self::NOISE_THRESHOLD { + self.description = BenchmarkVarianceDescription::Noisy; } } @@ -598,8 +597,7 @@ impl BenchmarkVariance { fn is_dodgy(&self) -> bool { matches!( self.description, - BenchmarkVarianceDescription::Noisy(_) - | BenchmarkVarianceDescription::HighlyVariable(_) + BenchmarkVarianceDescription::Noisy | BenchmarkVarianceDescription::HighlyVariable ) } } @@ -610,14 +608,10 @@ pub enum BenchmarkVarianceDescription { Normal, /// A highly variable benchmark that produces many significant changes. /// This might indicate a benchmark which is very sensitive to compiler changes. - /// - /// Cotains the percentage of significant changes. - HighlyVariable(f64), + HighlyVariable, /// A noisy benchmark which is likely to see changes in performance simply between /// compiler runs. - /// - /// Contains the percent change that happens on average - Noisy(f64), + Noisy, } impl Default for BenchmarkVarianceDescription { @@ -668,7 +662,7 @@ impl TestResultComparison { /// The amount of relative change considered significant when /// the test case is dodgy - const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY: f64 = 1.0; + const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY: f64 = 0.01; fn log_change(&self) -> f64 { let (a, b) = self.results; From ae894118410114a5b09754116c9876d2d543a5ce Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Thu, 5 Aug 2021 10:22:03 +0200 Subject: [PATCH 09/12] Small adjustments --- site/src/comparison.rs | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 8e596e206..dc48d97e7 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -161,13 +161,10 @@ impl ComparisonSummary { /// The direction of the changes pub fn direction(&self) -> Option { - let d = match ( - self.largest_positive_change(), - &self.largest_negative_change(), - ) { + let d = match (self.largest_improvement(), self.largest_regression()) { (None, None) => 
return None, - (Some(b), None) => b.direction(), - (None, Some(b)) => b.direction(), + (Some(c), None) => c.direction(), + (None, Some(c)) => c.direction(), (Some(a), Some(b)) if a.is_increase() == b.is_increase() => a.direction(), _ => Direction::Mixed, }; @@ -186,12 +183,12 @@ impl ComparisonSummary { changes } - pub fn largest_positive_change(&self) -> Option<&TestResultComparison> { - self.comparisons.first().filter(|s| s.is_increase()) + pub fn largest_improvement(&self) -> Option<&TestResultComparison> { + self.comparisons.iter().filter(|s| !s.is_increase()).next() } - pub fn largest_negative_change(&self) -> Option<&TestResultComparison> { - self.comparisons.last().filter(|s| !s.is_increase()) + pub fn largest_regression(&self) -> Option<&TestResultComparison> { + self.comparisons.iter().filter(|s| s.is_increase()).next() } pub fn confidence(&self) -> ComparisonConfidence { @@ -228,7 +225,7 @@ impl ComparisonSummary { /// The amount of confidence we have that a comparison actually represents a real /// change in the performance characteristics. 
-#[derive(Clone, Copy)] +#[derive(Clone, Copy, Debug)] pub enum ComparisonConfidence { MaybeRelevant, ProbablyRelevant, @@ -658,7 +655,7 @@ pub struct TestResultComparison { impl TestResultComparison { /// The amount of relative change considered significant when /// the test case is not dodgy - const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD: f64 = 0.001; + const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD: f64 = 0.002; /// The amount of relative change considered significant when /// the test case is dodgy @@ -757,7 +754,7 @@ impl std::hash::Hash for TestResultComparison { } // The direction of a performance change -#[derive(PartialEq, Eq, Hash)] +#[derive(PartialEq, Eq, Hash, Debug)] pub enum Direction { Improvement, Regression, From 0af149a4388f25b7ccbd95e8025f22c095115def Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Thu, 5 Aug 2021 14:40:49 +0200 Subject: [PATCH 10/12] Take magnitude into account --- docs/comparison-analysis.md | 22 +++-- site/src/comparison.rs | 182 +++++++++++++++++++++++++++--------- site/src/github.rs | 2 +- 3 files changed, 156 insertions(+), 50 deletions(-) diff --git a/docs/comparison-analysis.md b/docs/comparison-analysis.md index ad6d091ec..528e172c9 100644 --- a/docs/comparison-analysis.md +++ b/docs/comparison-analysis.md @@ -22,7 +22,16 @@ At the core of comparison analysis are the collection of test results for the tw Analysis of the changes is performed in order to determine whether artifact B represents a performance change over artifact A. At a high level the analysis performed takes the following form: -How many _significant_ test results indicate performance changes? If all significant test results indicate regressions (i.e., all percent relative changes are positive), then artifact B represents a performance regression over artifact A. If all significant test results indicate improvements (i.e., all percent relative changes are negative), then artifact B represents a performance improvement over artifact B. 
If some significant test results indicate improvement and others indicate regressions, then the performance change is mixed. +How many _significant_ test results indicate performance changes and what is the magnitude of the changes (i.e., how large are the changes regardless of the direction of change)? + +* If there are improvements and regressions with magnitude of medium or above then the comparison is mixed. +* If there are only either improvements or regressions then the comparison is labeled with that kind. +* If one kind of change is of medium or above magnitude (and the other kind is not), then the comparison is mixed if 15% or more of the total changes are the other (small magnitude) kind. For example: + * given 20 regressions (with at least 1 of medium magnitude) and all improvements of low magnitude, the comparison is only mixed if there are 4 or more improvements. + * given 5 regressions (with at least 1 of medium magnitude) and all improvements of low magnitude, the comparison is only mixed if there are 1 or more improvements. +* If both kinds of changes are all low magnitude changes, then the comparison is mixed unless 90% or more of total changes are of one kind. For example: + * given 20 changes of different kinds all of low magnitude, the result is mixed unless only 2 or fewer of the changes are of one kind. + * given 5 changes of different kinds all of low magnitude, the result is always mixed. Whether we actually _report_ an analysis or not depends on the context and how _confident_ we are in the summary of the results (see below for an explanation of how confidence is derived). For example, in pull request performance "try" runs, we report a performance change if we are at least confident that the results are "probably relevant", while for the triage report, we only report if the we are confident the results are "definitely relevant".
@@ -48,10 +57,9 @@ A noisy test case is one where of all the non-significant relative delta changes ### How is confidence in whether a test analysis is "relevant" determined? -The confidence in whether a test analysis is relevant depends on the number of significant test results. Depending on that number a confidence level is reached: - -* Maybe relevant: 0-3 changes -* Probably relevant: 4-6 changes -* Definitely relevant: >6 changes +The confidence in whether a test analysis is relevant depends on the number of significant test results and their magnitude (how large a change is regardless of the direction of the change). -Note: changes can be any combination of positive or negative changes. \ No newline at end of file +The actual algorithm for determining confidence may change, but in general the following rules apply: +* Definitely relevant: any number of very large changes, a small amount of large and/or medium changes, or a large amount of small changes. +* Probably relevant: any number of large changes, more than 1 medium change, or smaller but still substantial amount of small changes. +* Maybe relevant: if it doesn't fit into the above two categories, it ends in this category. 
diff --git a/site/src/comparison.rs b/site/src/comparison.rs index dc48d97e7..c2f6d17ca 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -133,7 +133,7 @@ async fn populate_report(comparison: &Comparison, report: &mut HashMap, } @@ -150,8 +150,8 @@ impl ComparisonSummary { } let cmp = |b1: &TestResultComparison, b2: &TestResultComparison| { - b1.log_change() - .partial_cmp(&b2.log_change()) + b2.log_change() + .partial_cmp(&b1.log_change()) .unwrap_or(std::cmp::Ordering::Equal) }; comparisons.sort_by(cmp); @@ -161,41 +161,108 @@ impl ComparisonSummary { /// The direction of the changes pub fn direction(&self) -> Option { - let d = match (self.largest_improvement(), self.largest_regression()) { - (None, None) => return None, - (Some(c), None) => c.direction(), - (None, Some(c)) => c.direction(), - (Some(a), Some(b)) if a.is_increase() == b.is_increase() => a.direction(), - _ => Direction::Mixed, - }; - Some(d) + if self.comparisons.len() == 0 { + return None; + } + + let (regressions, improvements): (Vec<&TestResultComparison>, _) = + self.comparisons.iter().partition(|c| c.is_regression()); + + if regressions.len() == 0 { + return Some(Direction::Improvement); + } + + if improvements.len() == 0 { + return Some(Direction::Regression); + } + + let total_num = self.comparisons.len(); + let regressions_ratio = regressions.len() as f64 / total_num as f64; + + let has_medium_and_above_regressions = regressions + .iter() + .any(|c| c.magnitude().is_medium_or_above()); + let has_medium_and_above_improvements = improvements + .iter() + .any(|c| c.magnitude().is_medium_or_above()); + match ( + has_medium_and_above_improvements, + has_medium_and_above_regressions, + ) { + (true, true) => return Some(Direction::Mixed), + (true, false) => { + if regressions_ratio >= 0.15 { + Some(Direction::Mixed) + } else { + Some(Direction::Improvement) + } + } + (false, true) => { + if regressions_ratio < 0.85 { + Some(Direction::Mixed) + } else { + 
Some(Direction::Regression) + } + } + (false, false) => { + if regressions_ratio >= 0.1 && regressions_ratio <= 0.9 { + Some(Direction::Mixed) + } else if regressions_ratio <= 0.1 { + Some(Direction::Improvement) + } else { + Some(Direction::Regression) + } + } + } } - /// The n largest changes ordered by the magnitude of their change - pub fn largest_changes(&self, n: usize) -> Vec<&TestResultComparison> { - let mut changes: Vec<&TestResultComparison> = self.comparisons.iter().take(n).collect(); - changes.sort_by(|a, b| { - b.log_change() - .abs() - .partial_cmp(&a.log_change().abs()) - .unwrap_or(std::cmp::Ordering::Equal) - }); - changes + pub fn relevant_changes<'a>(&'a self) -> [Option<&TestResultComparison>; 2] { + match self.direction() { + Some(Direction::Improvement) => [self.largest_improvement(), None], + Some(Direction::Regression) => [self.largest_regression(), None], + Some(Direction::Mixed) => [self.largest_improvement(), self.largest_regression()], + None => [None, None], + } } pub fn largest_improvement(&self) -> Option<&TestResultComparison> { - self.comparisons.iter().filter(|s| !s.is_increase()).next() + self.comparisons + .iter() + .filter(|s| !s.is_regression()) + .next() } pub fn largest_regression(&self) -> Option<&TestResultComparison> { - self.comparisons.iter().filter(|s| s.is_increase()).next() + self.comparisons.iter().filter(|s| s.is_regression()).next() } pub fn confidence(&self) -> ComparisonConfidence { - match self.comparisons.len() { - 0..=3 => ComparisonConfidence::MaybeRelevant, - 4..=6 => ComparisonConfidence::ProbablyRelevant, - _ => ComparisonConfidence::DefinitelyRelevant, + let mut num_small_changes = 0; + let mut num_medium_changes = 0; + let mut num_large_changes = 0; + let mut num_very_large_changes = 0; + for c in self.comparisons.iter() { + match c.magnitude() { + Magnitude::Small => num_small_changes += 1, + Magnitude::Medium => num_medium_changes += 1, + Magnitude::Large => num_large_changes += 1, + 
Magnitude::VeryLarge => num_very_large_changes += 1, + } + } + + match ( + num_medium_changes, + num_large_changes, + num_very_large_changes, + ) { + (_, _, vl) if vl > 0 => ComparisonConfidence::DefinitelyRelevant, + (m, l, _) if m + (l * 2) > 5 => ComparisonConfidence::DefinitelyRelevant, + (m, l, _) if m > 1 || l > 0 => ComparisonConfidence::ProbablyRelevant, + (m, _, _) => match m * 2 + num_small_changes { + 0..=5 => ComparisonConfidence::MaybeRelevant, + 6..=10 => ComparisonConfidence::ProbablyRelevant, + _ => ComparisonConfidence::DefinitelyRelevant, + }, } } @@ -215,7 +282,7 @@ impl ComparisonSummary { let end = &comparison.b.artifact; let link = &compare_link(start, end); - for change in self.largest_changes(2) { + for change in self.relevant_changes().iter().filter_map(|s| *s) { write!(result, "- ").unwrap(); change.summary_line(&mut result, Some(link)) } @@ -666,7 +733,7 @@ impl TestResultComparison { (b / a).ln() } - fn is_increase(&self) -> bool { + fn is_regression(&self) -> bool { let (a, b) = self.results; b > a } @@ -681,6 +748,19 @@ impl TestResultComparison { self.relative_change().abs() > threshold } + fn magnitude(&self) -> Magnitude { + let mag = self.relative_change().abs(); + if mag < 0.01 { + Magnitude::Small + } else if mag < 0.04 { + Magnitude::Medium + } else if mag < 0.1 { + Magnitude::Large + } else { + Magnitude::VeryLarge + } + } + fn is_dodgy(&self) -> bool { self.variance .as_ref() @@ -703,24 +783,13 @@ impl TestResultComparison { pub fn summary_line(&self, summary: &mut String, link: Option<&str>) { use std::fmt::Write; - let magnitude = self.relative_change().abs(); - let size = if magnitude > 0.10 { - "Very large" - } else if magnitude > 0.05 { - "Large" - } else if magnitude > 0.01 { - "Moderate" - } else if magnitude > 0.005 { - "Small" - } else { - "Very small" - }; + let magnitude = self.magnitude(); let percent = self.relative_change() * 100.0; write!( summary, "{} {} in {}", - size, + magnitude, self.direction(), match link 
{ Some(l) => format!("[instruction counts]({})", l), @@ -736,6 +805,7 @@ impl TestResultComparison { .unwrap(); } } + impl std::cmp::PartialEq for TestResultComparison { fn eq(&self, other: &Self) -> bool { self.benchmark == other.benchmark @@ -743,6 +813,7 @@ impl std::cmp::PartialEq for TestResultComparison { && self.scenario == other.scenario } } + impl std::cmp::Eq for TestResultComparison {} impl std::hash::Hash for TestResultComparison { @@ -772,6 +843,33 @@ impl std::fmt::Display for Direction { } } +/// The relative size of a performance change +#[derive(Debug)] +enum Magnitude { + Small, + Medium, + Large, + VeryLarge, +} + +impl Magnitude { + fn is_medium_or_above(&self) -> bool { + !matches!(self, Self::Small) + } +} + +impl std::fmt::Display for Magnitude { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::Small => "Small", + Self::Medium => "Moderate", + Self::Large => "Large", + Self::VeryLarge => "Very large", + }; + f.write_str(s) + } +} + async fn generate_report( start: &Bound, end: &Bound, diff --git a/site/src/github.rs b/site/src/github.rs index 08ae3fc7d..0a59e74a4 100644 --- a/site/src/github.rs +++ b/site/src/github.rs @@ -630,7 +630,7 @@ async fn categorize_benchmark( "This change led to significant {} in compiler performance.\n", category ); - for change in summary.largest_changes(2) { + for change in summary.relevant_changes().iter().filter_map(|c| *c) { write!(result, "- ").unwrap(); change.summary_line(&mut result, None) } From e8bd5425a5a8a922bef403583670b1c3a2d699d1 Mon Sep 17 00:00:00 2001 From: Ryan Levick Date: Wed, 11 Aug 2021 15:39:41 +0200 Subject: [PATCH 11/12] Include probably relevant changes in triage for now --- site/src/comparison.rs | 90 +++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index c2f6d17ca..96d2fe270 100644 --- a/site/src/comparison.rs +++ 
b/site/src/comparison.rs @@ -37,6 +37,7 @@ pub async fn handle_triage( let mut report = HashMap::new(); let mut before = start.clone(); + let mut num_comparisons = 0; loop { let comparison = match compare_given_commits( before, @@ -56,6 +57,7 @@ pub async fn handle_triage( break; } }; + num_comparisons += 1; log::info!( "Comparing {} to {}", comparison.b.artifact, @@ -77,7 +79,7 @@ pub async fn handle_triage( } let end = end.unwrap_or(next); - let report = generate_report(&start, &end, report).await; + let report = generate_report(&start, &end, report, num_comparisons).await; Ok(api::triage::Response(report)) } @@ -120,11 +122,17 @@ pub async fn handle_compare( }) } -async fn populate_report(comparison: &Comparison, report: &mut HashMap>) { +async fn populate_report( + comparison: &Comparison, + report: &mut HashMap, Vec>, +) { if let Some(summary) = ComparisonSummary::summarize_comparison(comparison) { - if summary.confidence().is_definitely_relevant() { + let confidence = summary.confidence(); + if confidence.is_atleast_probably_relevant() { if let Some(direction) = summary.direction() { - let entry = report.entry(direction).or_default(); + let entry = report + .entry(confidence.is_definitely_relevant().then(|| direction)) + .or_default(); entry.push(summary.write(comparison).await) } @@ -140,7 +148,7 @@ pub struct ComparisonSummary { impl ComparisonSummary { pub fn summarize_comparison(comparison: &Comparison) -> Option { let mut comparisons = comparison - .get_benchmarks() + .get_individual_comparisons() .filter(|c| c.is_significant()) .cloned() .collect::>(); @@ -150,8 +158,9 @@ impl ComparisonSummary { } let cmp = |b1: &TestResultComparison, b2: &TestResultComparison| { - b2.log_change() - .partial_cmp(&b1.log_change()) + b2.relative_change() + .abs() + .partial_cmp(&b1.relative_change().abs()) .unwrap_or(std::cmp::Ordering::Equal) }; comparisons.sort_by(cmp); @@ -256,13 +265,15 @@ impl ComparisonSummary { num_very_large_changes, ) { (_, _, vl) if vl > 0 => 
ComparisonConfidence::DefinitelyRelevant, - (m, l, _) if m + (l * 2) > 5 => ComparisonConfidence::DefinitelyRelevant, + (m, l, _) if m + (l * 2) > 4 => ComparisonConfidence::DefinitelyRelevant, (m, l, _) if m > 1 || l > 0 => ComparisonConfidence::ProbablyRelevant, - (m, _, _) => match m * 2 + num_small_changes { - 0..=5 => ComparisonConfidence::MaybeRelevant, - 6..=10 => ComparisonConfidence::ProbablyRelevant, - _ => ComparisonConfidence::DefinitelyRelevant, - }, + (m, _, _) => { + if (m * 2 + num_small_changes) > 10 { + ComparisonConfidence::ProbablyRelevant + } else { + ComparisonConfidence::MaybeRelevant + } + } } } @@ -539,7 +550,7 @@ impl Comparison { next_commit(&self.b.artifact, master_commits).map(|c| c.sha.clone()) } - fn get_benchmarks(&self) -> impl Iterator { + fn get_individual_comparisons(&self) -> impl Iterator { self.statistics.iter().filter(|b| b.profile != Profile::Doc) } } @@ -726,12 +737,7 @@ impl TestResultComparison { /// The amount of relative change considered significant when /// the test case is dodgy - const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY: f64 = 0.01; - - fn log_change(&self) -> f64 { - let (a, b) = self.results; - (b / a).ln() - } + const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY: f64 = 0.008; fn is_regression(&self) -> bool { let (a, b) = self.results; @@ -739,22 +745,25 @@ impl TestResultComparison { } fn is_significant(&self) -> bool { - let threshold = if self.is_dodgy() { + self.relative_change().abs() > self.signifcance_threshold() + } + + fn signifcance_threshold(&self) -> f64 { + if self.is_dodgy() { Self::SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY } else { Self::SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD - }; - - self.relative_change().abs() > threshold + } } fn magnitude(&self) -> Magnitude { let mag = self.relative_change().abs(); - if mag < 0.01 { + let threshold = self.signifcance_threshold(); + if mag < threshold * 3.0 { Magnitude::Small - } else if mag < 0.04 { + } else if mag < threshold * 10.0 { 
Magnitude::Medium - } else if mag < 0.1 { + } else if mag < threshold * 25.0 { Magnitude::Large } else { Magnitude::VeryLarge @@ -873,7 +882,8 @@ impl std::fmt::Display for Magnitude { async fn generate_report( start: &Bound, end: &Bound, - mut report: HashMap>, + mut report: HashMap, Vec>, + num_comparisons: usize, ) -> String { fn fmt_bound(bound: &Bound) -> String { match bound { @@ -884,9 +894,14 @@ async fn generate_report( } let start = fmt_bound(start); let end = fmt_bound(end); - let regressions = report.remove(&Direction::Regression).unwrap_or_default(); - let improvements = report.remove(&Direction::Improvement).unwrap_or_default(); - let mixed = report.remove(&Direction::Mixed).unwrap_or_default(); + let regressions = report + .remove(&Some(Direction::Regression)) + .unwrap_or_default(); + let improvements = report + .remove(&Some(Direction::Improvement)) + .unwrap_or_default(); + let mixed = report.remove(&Some(Direction::Mixed)).unwrap_or_default(); + let unlabeled = report.remove(&None).unwrap_or_default(); let untriaged = match github::untriaged_perf_regressions().await { Ok(u) => u .iter() @@ -910,6 +925,8 @@ TODO: Summary Triage done by **@???**. Revision range: [{first_commit}..{last_commit}](https://perf.rust-lang.org/?start={first_commit}&end={last_commit}&absolute=false&stat=instructions%3Au) +{num_comparisons} comparisons made in total +{num_def_relevant} definitely relevant comparisons and {num_prob_relevant} probably relevant comparisons {num_regressions} Regressions, {num_improvements} Improvements, {num_mixed} Mixed; ??? of them in rollups @@ -925,6 +942,13 @@ Revision range: [{first_commit}..{last_commit}](https://perf.rust-lang.org/?star {mixed} +#### Probably changed + +The following is a list of comparisons which *probably* represent real performance changes, +but we're not 100% sure. 
+ +{unlabeled} + #### Untriaged Pull Requests {untriaged} @@ -937,12 +961,16 @@ TODO: Nags date = chrono::Utc::today().format("%Y-%m-%d"), first_commit = start, last_commit = end, + num_comparisons = num_comparisons, + num_def_relevant = regressions.len() + improvements.len() + mixed.len(), + num_prob_relevant = unlabeled.len(), num_regressions = regressions.len(), num_improvements = improvements.len(), num_mixed = mixed.len(), regressions = regressions.join("\n\n"), improvements = improvements.join("\n\n"), mixed = mixed.join("\n\n"), + unlabeled = unlabeled.join("\n\n"), untriaged = untriaged ) } From a1a97a8be5e5739ce0c01aef9ae4b30e0669367a Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Wed, 11 Aug 2021 12:14:18 -0400 Subject: [PATCH 12/12] Update site/src/comparison.rs --- site/src/comparison.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/site/src/comparison.rs b/site/src/comparison.rs index 96d2fe270..c5a1b25a4 100644 --- a/site/src/comparison.rs +++ b/site/src/comparison.rs @@ -945,7 +945,9 @@ Revision range: [{first_commit}..{last_commit}](https://perf.rust-lang.org/?star #### Probably changed The following is a list of comparisons which *probably* represent real performance changes, -but we're not 100% sure. +but we're not 100% sure. Please move things from this category into the categories +above for changes you think *are* definitely relevant and file an issue for each so that +we can consider how to change our heuristics. {unlabeled}