
Commit 04ff853

Add new self-profile event for aggregating query hit counts
1 parent e4b9d01 commit 04ff853

2 files changed: 88 additions, 7 deletions

compiler/rustc_data_structures/src/profiling.rs

Lines changed: 87 additions & 7 deletions
@@ -88,6 +88,7 @@ use std::fmt::Display;
 use std::intrinsics::unlikely;
 use std::path::Path;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::time::{Duration, Instant};
 use std::{fs, process};

@@ -105,6 +106,8 @@ bitflags::bitflags! {
     struct EventFilter: u16 {
         const GENERIC_ACTIVITIES = 1 << 0;
         const QUERY_PROVIDERS = 1 << 1;
+        /// Store detailed instant events, including timestamp and thread ID,
+        /// for each query cache hit. Note that this is quite expensive.
         const QUERY_CACHE_HITS = 1 << 2;
         const QUERY_BLOCKED = 1 << 3;
         const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +116,20 @@ bitflags::bitflags! {
         const FUNCTION_ARGS = 1 << 6;
         const LLVM = 1 << 7;
         const INCR_RESULT_HASHING = 1 << 8;
-        const ARTIFACT_SIZES = 1 << 9;
+        const ARTIFACT_SIZES = 1 << 9;
+        /// Store aggregated counts of cache hits per query invocation.
+        const QUERY_CACHE_HIT_COUNTS = 1 << 10;

         const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
            Self::QUERY_PROVIDERS.bits() |
            Self::QUERY_BLOCKED.bits() |
            Self::INCR_CACHE_LOADS.bits() |
            Self::INCR_RESULT_HASHING.bits() |
-           Self::ARTIFACT_SIZES.bits();
+           Self::ARTIFACT_SIZES.bits() |
+           Self::QUERY_CACHE_HIT_COUNTS.bits();

         const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
+        const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
     }
 }

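The QUERY_CACHE_HIT_COMBINED mask exists so the hot path can ask "is any kind of cache-hit event enabled?" with a single test, and only distinguish the two kinds on the cold path. A minimal, self-contained sketch of that pattern, assuming the bitflags 2.x crate (the flag values simply mirror the ones above):

    use bitflags::bitflags;

    bitflags! {
        #[derive(Clone, Copy)]
        struct EventFilter: u16 {
            const QUERY_CACHE_HITS = 1 << 2;
            const QUERY_CACHE_HIT_COUNTS = 1 << 10;
            // Union of both cache-hit flags: one cheap test covers both event kinds.
            const QUERY_CACHE_HIT_COMBINED =
                Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
        }
    }

    fn main() {
        let mask = EventFilter::QUERY_CACHE_HIT_COUNTS;
        // Hot path: one branch covering both event kinds.
        assert!(mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED));
        // Cold path: re-check which specific kind is enabled.
        assert!(!mask.contains(EventFilter::QUERY_CACHE_HITS));
        assert!(mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS));
    }
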
@@ -134,6 +141,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
     ("generic-activity", EventFilter::GENERIC_ACTIVITIES),
     ("query-provider", EventFilter::QUERY_PROVIDERS),
     ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
+    ("query-cache-hit-count", EventFilter::QUERY_CACHE_HIT_COUNTS),
     ("query-blocked", EventFilter::QUERY_BLOCKED),
     ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
     ("query-keys", EventFilter::QUERY_KEYS),
@@ -411,13 +419,24 @@ impl SelfProfilerRef {
         #[inline(never)]
         #[cold]
         fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
-            profiler_ref.instant_query_event(
-                |profiler| profiler.query_cache_hit_event_kind,
-                query_invocation_id,
-            );
+            if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+                profiler_ref
+                    .profiler
+                    .as_ref()
+                    .unwrap()
+                    .increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
+            }
+            if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+                profiler_ref.instant_query_event(
+                    |profiler| profiler.query_cache_hit_event_kind,
+                    query_invocation_id,
+                );
+            }
         }

-        if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+        // We check both kinds of query cache hit events at once, to reduce overhead in the
+        // common case (with self-profile disabled).
+        if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
             cold_call(self, query_invocation_id);
         }
     }
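
The shape of this hot/cold split is the interesting part: the caller performs a single branch on the (usually empty) filter mask, and all profiling work lives in a nested function marked #[cold] and #[inline(never)] so it stays out of the hot instruction stream. A stripped-down sketch of the same pattern on stable Rust; rustc additionally wraps the branch in the unstable std::intrinsics::unlikely hint, and the names below are illustrative only:

    fn on_query_cache_hit(filter_mask_nonempty: bool, query_invocation_id: u64) {
        // Everything that only happens when self-profiling is enabled lives here,
        // kept out of line so the common case stays as small as possible.
        #[cold]
        #[inline(never)]
        fn cold_call(query_invocation_id: u64) {
            // ... record the cache hit (aggregated count and/or instant event) ...
            println!("cache hit for invocation {query_invocation_id}");
        }

        if filter_mask_nonempty {
            cold_call(query_invocation_id);
        }
    }

    fn main() {
        on_query_cache_hit(false, 7); // common case: branch not taken, nothing inlined
        on_query_cache_hit(true, 7);  // profiling enabled: take the cold path
    }
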
@@ -489,6 +508,30 @@ impl SelfProfilerRef {
         self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
     }

+    /// Store query cache hits to the self-profile log.
+    /// Should be called once at the end of the compilation session.
+    ///
+    /// The cache hits are stored per **query invocation**, not **per query kind/type**.
+    /// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
+    /// IDs.
+    pub fn store_query_cache_hits(&self) {
+        if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+            let profiler = self.profiler.as_ref().unwrap();
+            let query_hits = profiler.query_hits.read();
+            let builder = EventIdBuilder::new(&profiler.profiler);
+            let thread_id = get_thread_id();
+            for (query_invocation, hit_count) in query_hits.iter().enumerate() {
+                let event_id = builder.from_label(StringId::new_virtual(query_invocation as u64));
+                profiler.profiler.record_integer_event(
+                    profiler.query_cache_hit_count_event_kind,
+                    event_id,
+                    thread_id,
+                    hit_count.load(Ordering::Relaxed),
+                );
+            }
+        }
+    }
+
     #[inline]
     pub fn enabled(&self) -> bool {
         self.profiler.is_some()
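
As the doc comment notes, the emitted counts are keyed by query invocation rather than by query kind, and a consumer such as analyzeme is expected to fold invocations of the same query back together. A hedged sketch of that post-processing step, assuming a hypothetical mapping from invocation index to query label (the real label lookup goes through the self-profile string data, not shown here):

    use std::collections::HashMap;

    /// Hypothetical post-processing: collapse per-invocation hit counts into
    /// per-query-label totals, the way a trace consumer might.
    fn totals_by_label(hits_per_invocation: &[u64], label_of: &[&str]) -> HashMap<String, u64> {
        let mut totals: HashMap<String, u64> = HashMap::new();
        for (invocation, &hits) in hits_per_invocation.iter().enumerate() {
            let label = label_of.get(invocation).copied().unwrap_or("<unknown>");
            *totals.entry(label.to_string()).or_insert(0) += hits;
        }
        totals
    }

    fn main() {
        // Three invocations, two of which belong to the same query kind (illustrative data).
        let hits = [10, 3, 5];
        let labels = ["typeck", "typeck", "mir_built"];
        let totals = totals_by_label(&hits, &labels);
        assert_eq!(totals["typeck"], 13);
        assert_eq!(totals["mir_built"], 5);
    }
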
@@ -537,13 +580,28 @@ pub struct SelfProfiler {

     string_cache: RwLock<FxHashMap<String, StringId>>,

+    /// Recording individual query cache hits as "instant" measureme events
+    /// is incredibly expensive. Instead of doing that, we simply aggregate
+    /// cache hit *counts* per query invocation, and then store the final count
+    /// of cache hits per invocation at the end of the compilation session.
+    ///
+    /// With this approach, we don't know the individual thread IDs and timestamps
+    /// of cache hits, but it has very little overhead on top of `-Zself-profile`.
+    /// Recording the cache hits as individual events made compilation 3-5x slower.
+    ///
+    /// Query invocation IDs should be monotonic integers, so we can store them in a vec,
+    /// rather than using a hashmap.
+    query_hits: RwLock<Vec<AtomicU64>>,
+
     query_event_kind: StringId,
     generic_activity_event_kind: StringId,
     incremental_load_result_event_kind: StringId,
     incremental_result_hashing_event_kind: StringId,
     query_blocked_event_kind: StringId,
     query_cache_hit_event_kind: StringId,
     artifact_size_event_kind: StringId,
+    /// Total cache hits per query invocation.
+    query_cache_hit_count_event_kind: StringId,
 }

 impl SelfProfiler {
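
The comment above summarizes the whole design: keep one atomic counter per query invocation in memory, and only turn those counters into profile events once, at the end of the session. A minimal sketch of that aggregate-then-dump flow using only the standard library, where std::sync::RwLock stands in for the profiler's own lock type and the dump just prints instead of writing measureme events:

    use std::sync::RwLock;
    use std::sync::atomic::{AtomicU64, Ordering};

    struct CacheHitCounters {
        // One counter per query invocation; invocation IDs are dense, monotonically
        // assigned integers, so a Vec indexed by ID is enough.
        hits: RwLock<Vec<AtomicU64>>,
    }

    impl CacheHitCounters {
        fn record_hit(&self, invocation: usize) {
            // Fast path: the vec is already long enough, so a shared lock plus a
            // relaxed increment is all that's needed.
            {
                let hits = self.hits.read().unwrap();
                if let Some(counter) = hits.get(invocation) {
                    counter.fetch_add(1, Ordering::Relaxed);
                    return;
                }
            }
            // Slow path: grow the vec under an exclusive lock, then count this hit.
            let mut hits = self.hits.write().unwrap();
            if hits.len() <= invocation {
                hits.resize_with(invocation + 1, || AtomicU64::new(0));
            }
            hits[invocation].fetch_add(1, Ordering::Relaxed);
        }

        // End-of-session dump, analogous to store_query_cache_hits above.
        fn dump(&self) {
            for (invocation, counter) in self.hits.read().unwrap().iter().enumerate() {
                println!("invocation {invocation}: {} hits", counter.load(Ordering::Relaxed));
            }
        }
    }

    fn main() {
        let counters = CacheHitCounters { hits: RwLock::new(Vec::new()) };
        counters.record_hit(2);
        counters.record_hit(2);
        counters.record_hit(0);
        counters.dump();
    }
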
@@ -573,6 +631,7 @@ impl SelfProfiler {
         let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
         let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
         let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
+        let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");

         let mut event_filter_mask = EventFilter::empty();

@@ -618,6 +677,8 @@ impl SelfProfiler {
             query_blocked_event_kind,
             query_cache_hit_event_kind,
             artifact_size_event_kind,
+            query_cache_hit_count_event_kind,
+            query_hits: Default::default(),
         })
     }

@@ -627,6 +688,25 @@ impl SelfProfiler {
         self.profiler.alloc_string(s)
     }

+    /// Store a cache hit of a query invocation.
+    pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
+        // Fast path: assume that the query was already encountered before, and just record
+        // a cache hit.
+        let mut guard = self.query_hits.upgradable_read();
+        let query_hits = &guard;
+        let index = id.0 as usize;
+        if index < query_hits.len() {
+            // We only want to increment the count, no other synchronization is required.
+            query_hits[index].fetch_add(1, Ordering::Relaxed);
+        } else {
+            // If not, we need to extend the query hit map to the highest observed ID.
+            guard.with_upgraded(|vec| {
+                vec.resize_with(index + 1, || AtomicU64::new(0));
+                vec[index] = AtomicU64::from(1);
+            });
+        }
+    }
+
     /// Gets a `StringId` for the given string. This method makes sure that
     /// any strings going through it will only be allocated once in the
     /// profiling data.
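increment_query_cache_hit_counters leans on an upgradable read lock: any number of threads may hold the plain read side and bump counters concurrently, while at most one thread can upgrade to a writer to grow the vec when a new, larger invocation ID shows up. A sketch of the same idea written against the parking_lot crate directly, using RwLockUpgradableReadGuard::upgrade instead of the with_upgraded helper seen above (the type and method names here are illustrative, not rustc's):

    use parking_lot::{RwLock, RwLockUpgradableReadGuard};
    use std::sync::atomic::{AtomicU64, Ordering};

    struct HitCounters {
        hits: RwLock<Vec<AtomicU64>>,
    }

    impl HitCounters {
        fn increment(&self, index: usize) {
            // Upgradable read: does not block plain readers, but only one thread at a
            // time may hold it, so the grow-the-vec decision below is race-free.
            let guard = self.hits.upgradable_read();
            if index < guard.len() {
                guard[index].fetch_add(1, Ordering::Relaxed);
            } else {
                let mut writer = RwLockUpgradableReadGuard::upgrade(guard);
                writer.resize_with(index + 1, || AtomicU64::new(0));
                writer[index].fetch_add(1, Ordering::Relaxed);
            }
        }
    }

    fn main() {
        let counters = HitCounters { hits: RwLock::new(Vec::new()) };
        counters.increment(5);
        counters.increment(5);
        assert_eq!(counters.hits.read()[5].load(Ordering::Relaxed), 2);
    }
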
compiler/rustc_query_impl/src/profiling_support.rs

Lines changed: 1 addition & 0 deletions
@@ -259,4 +259,5 @@ pub fn alloc_self_profile_query_strings(tcx: TyCtxt<'_>) {
     for alloc in super::ALLOC_SELF_PROFILE_QUERY_STRINGS.iter() {
         alloc(tcx, &mut string_cache)
     }
+    tcx.sess.prof.store_query_cache_hits();
 }
