diff --git a/.sqlx/query-d896b69c6f6061b0652862e2baa958d5cee193e28ec6532bc1b1fbb98cfc3f16.json b/.sqlx/query-0431f4fe27d903ad6af26ff36df056a9009e8746f8334ae32f0c900975968532.json
similarity index 56%
rename from .sqlx/query-d896b69c6f6061b0652862e2baa958d5cee193e28ec6532bc1b1fbb98cfc3f16.json
rename to .sqlx/query-0431f4fe27d903ad6af26ff36df056a9009e8746f8334ae32f0c900975968532.json
index 7dd53aa43..6d79ada9d 100644
--- a/.sqlx/query-d896b69c6f6061b0652862e2baa958d5cee193e28ec6532bc1b1fbb98cfc3f16.json
+++ b/.sqlx/query-0431f4fe27d903ad6af26ff36df056a9009e8746f8334ae32f0c900975968532.json
@@ -1,6 +1,6 @@
 {
   "db_name": "PostgreSQL",
-  "query": "INSERT INTO releases (\n crate_id, version, release_time,\n dependencies, target_name, yanked,\n rustdoc_status, test_status, license, repository_url,\n homepage_url, description, description_long, readme,\n keywords, have_examples, downloads, files,\n doc_targets, is_library,\n documentation_url, default_target, features,\n repository_id, archive_storage\n )\n VALUES (\n $1, $2, $3, $4, $5, $6, $7, $8, $9,\n $10, $11, $12, $13, $14, $15, $16, $17, $18,\n $19, $20, $21, $22, $23, $24, $25\n )\n ON CONFLICT (crate_id, version) DO UPDATE\n SET release_time = $3,\n dependencies = $4,\n target_name = $5,\n yanked = $6,\n rustdoc_status = $7,\n test_status = $8,\n license = $9,\n repository_url = $10,\n homepage_url = $11,\n description = $12,\n description_long = $13,\n readme = $14,\n keywords = $15,\n have_examples = $16,\n downloads = $17,\n files = $18,\n doc_targets = $19,\n is_library = $20,\n documentation_url = $21,\n default_target = $22,\n features = $23,\n repository_id = $24,\n archive_storage = $25\n RETURNING id",
+  "query": "INSERT INTO releases (\n crate_id, version, release_time,\n dependencies, target_name, yanked,\n rustdoc_status, test_status, license, repository_url,\n homepage_url, description, description_long, readme,\n keywords, have_examples, downloads, files,\n doc_targets, is_library,\n documentation_url, default_target, features,\n repository_id, archive_storage, source_size\n )\n VALUES (\n $1, $2, $3, $4, $5, $6, $7, $8, $9,\n $10, $11, $12, $13, $14, $15, $16, $17, $18,\n $19, $20, $21, $22, $23, $24, $25, $26\n )\n ON CONFLICT (crate_id, version) DO UPDATE\n SET release_time = $3,\n dependencies = $4,\n target_name = $5,\n yanked = $6,\n rustdoc_status = $7,\n test_status = $8,\n license = $9,\n repository_url = $10,\n homepage_url = $11,\n description = $12,\n description_long = $13,\n readme = $14,\n keywords = $15,\n have_examples = $16,\n downloads = $17,\n files = $18,\n doc_targets = $19,\n is_library = $20,\n documentation_url = $21,\n default_target = $22,\n features = $23,\n repository_id = $24,\n archive_storage = $25,\n source_size = $26\n RETURNING id",
   "describe": {
     "columns": [
       {
@@ -58,12 +58,13 @@
           }
         },
         "Int4",
-        "Bool"
+        "Bool",
+        "Int8"
       ]
     },
     "nullable": [
      false
     ]
   },
-  "hash": "d896b69c6f6061b0652862e2baa958d5cee193e28ec6532bc1b1fbb98cfc3f16"
+  "hash": "0431f4fe27d903ad6af26ff36df056a9009e8746f8334ae32f0c900975968532"
 }
diff --git a/.sqlx/query-0da90d737b6bf2c1c1b0ab6b14c73c8363125578b1d6e30a99e70aa6bf7842c2.json b/.sqlx/query-0da90d737b6bf2c1c1b0ab6b14c73c8363125578b1d6e30a99e70aa6bf7842c2.json
new file mode 100644
index 000000000..78624ea28
--- /dev/null
+++ b/.sqlx/query-0da90d737b6bf2c1c1b0ab6b14c73c8363125578b1d6e30a99e70aa6bf7842c2.json
@@ -0,0 +1,57 @@
+{
+  "db_name": "PostgreSQL",
+  "query": "SELECT\n rustc_version,\n docsrs_version,\n build_status as \"build_status: BuildStatus\",\n documentation_size,\n errors\n FROM builds\n WHERE id = $1",
+  "describe": {
+    "columns": [
+      {
+        "ordinal": 0,
+        "name": "rustc_version",
+        "type_info": "Varchar"
+      },
+      {
+        "ordinal": 1,
+        "name": "docsrs_version",
+        "type_info": "Varchar"
+      },
+      {
+        "ordinal": 2,
+        "name": "build_status: BuildStatus",
+        "type_info": {
+          "Custom": {
+            "name": "build_status",
+            "kind": {
+              "Enum": [
+                "in_progress",
+                "success",
+                "failure"
+              ]
+            }
+          }
+        }
+      },
+      {
+        "ordinal": 3,
+        "name": "documentation_size",
+        "type_info": "Int8"
+      },
+      {
+        "ordinal": 4,
+        "name": "errors",
+        "type_info": "Text"
+      }
+    ],
+    "parameters": {
+      "Left": [
+        "Int4"
+      ]
+    },
+    "nullable": [
+      true,
+      true,
+      false,
+      true,
+      true
+    ]
+  },
+  "hash": "0da90d737b6bf2c1c1b0ab6b14c73c8363125578b1d6e30a99e70aa6bf7842c2"
+}
diff --git a/.sqlx/query-b2fa2e823f0f8e8fbd288cd0c102ad28a198463534efbefcf06f45c0f49872af.json b/.sqlx/query-c5315b9cdb9ffd0a939705f2700a73a3795cfe8e943716923ed9dfbaed3961af.json
similarity index 76%
rename from .sqlx/query-b2fa2e823f0f8e8fbd288cd0c102ad28a198463534efbefcf06f45c0f49872af.json
rename to .sqlx/query-c5315b9cdb9ffd0a939705f2700a73a3795cfe8e943716923ed9dfbaed3961af.json
index ade9b4eac..9c56a2226 100644
--- a/.sqlx/query-b2fa2e823f0f8e8fbd288cd0c102ad28a198463534efbefcf06f45c0f49872af.json
+++ b/.sqlx/query-c5315b9cdb9ffd0a939705f2700a73a3795cfe8e943716923ed9dfbaed3961af.json
@@ -1,6 +1,6 @@
 {
   "db_name": "PostgreSQL",
-  "query": "UPDATE builds\n SET\n rustc_version = $1,\n docsrs_version = $2,\n build_status = $3,\n build_server = $4,\n errors = $5,\n build_time = NOW()\n WHERE\n id = $6\n RETURNING rid",
+  "query": "UPDATE builds\n SET\n rustc_version = $1,\n docsrs_version = $2,\n build_status = $3,\n build_server = $4,\n errors = $5,\n documentation_size = $6,\n build_time = NOW()\n WHERE\n id = $7\n RETURNING rid",
   "describe": {
     "columns": [
       {
@@ -27,6 +27,7 @@
       },
       "Text",
       "Text",
+      "Int8",
       "Int4"
     ]
   },
@@ -34,5 +35,5 @@
     false
   ]
 },
-  "hash": "b2fa2e823f0f8e8fbd288cd0c102ad28a198463534efbefcf06f45c0f49872af"
+  "hash": "c5315b9cdb9ffd0a939705f2700a73a3795cfe8e943716923ed9dfbaed3961af"
 }
diff --git a/.sqlx/query-c87fb1f05c8d726ab1211cf3a5d4e43ce08ac13c468ef4d90c28ab5fa8ec6ac7.json b/.sqlx/query-f011aefb83cbd26e7e83edbb3d280c554a9e2a64f0e7953c1e52939b24ccdafa.json
similarity index 57%
rename from .sqlx/query-c87fb1f05c8d726ab1211cf3a5d4e43ce08ac13c468ef4d90c28ab5fa8ec6ac7.json
rename to .sqlx/query-f011aefb83cbd26e7e83edbb3d280c554a9e2a64f0e7953c1e52939b24ccdafa.json
index 751dfa4f0..d52fe3ac5 100644
--- a/.sqlx/query-c87fb1f05c8d726ab1211cf3a5d4e43ce08ac13c468ef4d90c28ab5fa8ec6ac7.json
+++ b/.sqlx/query-f011aefb83cbd26e7e83edbb3d280c554a9e2a64f0e7953c1e52939b24ccdafa.json
@@ -1,6 +1,6 @@
 {
   "db_name": "PostgreSQL",
-  "query": "SELECT\n r.rustdoc_status,\n r.default_target,\n r.doc_targets,\n r.archive_storage,\n cov.total_items,\n b.id as build_id,\n b.build_status::TEXT as build_status,\n b.docsrs_version,\n b.rustc_version\n FROM\n crates as c\n INNER JOIN releases AS r ON c.id = r.crate_id\n INNER JOIN builds as b ON r.id = b.rid\n LEFT OUTER JOIN doc_coverage AS cov ON r.id = cov.release_id\n WHERE\n c.name = $1 AND\n r.version = $2",
+  "query": "SELECT\n r.rustdoc_status,\n r.default_target,\n r.doc_targets,\n r.archive_storage,\n r.source_size as \"source_size!\",\n cov.total_items,\n b.id as build_id,\n b.build_status::TEXT as build_status,\n b.docsrs_version,\n b.rustc_version,\n b.documentation_size\n FROM\n crates as c\n INNER JOIN releases AS r ON c.id = r.crate_id\n INNER JOIN builds as b ON r.id = b.rid\n LEFT OUTER JOIN doc_coverage AS cov ON r.id = cov.release_id\n WHERE\n c.name = $1 AND\n r.version = $2",
   "describe": {
     "columns": [
       {
@@ -25,28 +25,38 @@
       },
       {
         "ordinal": 4,
+        "name": "source_size!",
+        "type_info": "Int8"
+      },
+      {
+        "ordinal": 5,
         "name": "total_items",
         "type_info": "Int4"
       },
       {
-        "ordinal": 5,
+        "ordinal": 6,
         "name": "build_id",
         "type_info": "Int4"
       },
       {
-        "ordinal": 6,
+        "ordinal": 7,
         "name": "build_status",
         "type_info": "Text"
       },
       {
-        "ordinal": 7,
+        "ordinal": 8,
         "name": "docsrs_version",
         "type_info": "Varchar"
      },
       {
-        "ordinal": 8,
+        "ordinal": 9,
         "name": "rustc_version",
         "type_info": "Varchar"
+      },
+      {
+        "ordinal": 10,
+        "name": "documentation_size",
+        "type_info": "Int8"
       }
     ],
     "parameters": {
@@ -61,11 +71,13 @@
       true,
       false,
       true,
+      true,
       false,
       null,
       true,
+      true,
       true
     ]
   },
-  "hash": "c87fb1f05c8d726ab1211cf3a5d4e43ce08ac13c468ef4d90c28ab5fa8ec6ac7"
+  "hash": "f011aefb83cbd26e7e83edbb3d280c554a9e2a64f0e7953c1e52939b24ccdafa"
 }
diff --git a/migrations/20241018031600_documentation_size.down.sql b/migrations/20241018031600_documentation_size.down.sql
new file mode 100644
index 000000000..8ca23b4ae
--- /dev/null
+++ b/migrations/20241018031600_documentation_size.down.sql
@@ -0,0 +1,2 @@
+ALTER TABLE builds DROP COLUMN documentation_size;
+ALTER TABLE releases DROP COLUMN source_size;
diff --git a/migrations/20241018031600_documentation_size.up.sql b/migrations/20241018031600_documentation_size.up.sql
new file mode 100644
index 000000000..b6a05630e
--- /dev/null
+++ b/migrations/20241018031600_documentation_size.up.sql
@@ -0,0 +1,2 @@
+ALTER TABLE builds ADD COLUMN documentation_size BIGINT;
+ALTER TABLE releases ADD COLUMN source_size BIGINT;
diff --git a/src/db/add_package.rs b/src/db/add_package.rs
index 027691d8c..df8b63465 100644
--- a/src/db/add_package.rs
+++ b/src/db/add_package.rs
@@ -26,7 +26,7 @@ use tracing::{debug, info, instrument};
 /// NOTE: `source_files` refers to the files originally in the crate,
 /// not the files generated by rustdoc.
 #[allow(clippy::too_many_arguments)]
-#[instrument(skip(conn))]
+#[instrument(skip(conn, compression_algorithms))]
 pub(crate) async fn add_package_into_database(
     conn: &mut sqlx::PgConnection,
     metadata_pkg: &MetadataPackage,
@@ -37,9 +37,10 @@ pub(crate) async fn add_package_into_database(
     registry_data: &ReleaseData,
     has_docs: bool,
     has_examples: bool,
-    compression_algorithms: std::collections::HashSet<CompressionAlgorithm>,
+    compression_algorithms: impl IntoIterator<Item = CompressionAlgorithm>,
     repository_id: Option<i32>,
     archive_storage: bool,
+    source_size: u64,
 ) -> Result<i32> {
     debug!("Adding package into database");
     let crate_id = initialize_crate(conn, &metadata_pkg.name).await?;
@@ -58,12 +59,12 @@
            keywords, have_examples, downloads, files,
            doc_targets, is_library,
            documentation_url, default_target, features,
-           repository_id, archive_storage
+           repository_id, archive_storage, source_size
        )
        VALUES (
            $1, $2, $3, $4, $5, $6, $7, $8, $9,
            $10, $11, $12, $13, $14, $15, $16, $17, $18,
-           $19, $20, $21, $22, $23, $24, $25
+           $19, $20, $21, $22, $23, $24, $25, $26
        )
        ON CONFLICT (crate_id, version) DO UPDATE
            SET release_time = $3,
@@ -88,7 +89,8 @@
                default_target = $22,
                features = $23,
                repository_id = $24,
-               archive_storage = $25
+               archive_storage = $25,
+               source_size = $26
        RETURNING id",
        crate_id,
        &metadata_pkg.version,
@@ -114,7 +116,8 @@
        default_target,
        features as Vec<Feature>,
        repository_id,
-       archive_storage
+       archive_storage,
+       source_size as i64,
    )
    .fetch_one(&mut *conn)
    .await?;
@@ -239,6 +242,7 @@ pub(crate) async fn finish_build(
     rustc_version: &str,
     docsrs_version: &str,
     build_status: BuildStatus,
+    documentation_size: Option<u64>,
     errors: Option<&str>,
 ) -> Result<()> {
     debug!("updating build after finishing");
@@ -252,15 +256,17 @@
            build_status = $3,
            build_server = $4,
            errors = $5,
+           documentation_size = $6,
            build_time = NOW()
        WHERE
-           id = $6
+           id = $7
        RETURNING rid",
        rustc_version,
        docsrs_version,
        build_status as BuildStatus,
        hostname.to_str().unwrap_or(""),
        errors,
+       documentation_size.map(|v| v as i64),
        build_id,
    )
    .fetch_one(&mut *conn)
    .await?
@@ -654,6 +660,7 @@ mod test {
             "rustc_version",
             "docsrs_version",
             BuildStatus::Success,
+            Some(42),
             None,
         )
         .await?;
@@ -663,6 +670,7 @@
                 rustc_version,
                 docsrs_version,
                 build_status as "build_status: BuildStatus",
+                documentation_size,
                 errors
             FROM builds
             WHERE id = $1"#,
@@ -674,6 +682,7 @@
         assert_eq!(row.rustc_version, Some("rustc_version".into()));
         assert_eq!(row.docsrs_version, Some("docsrs_version".into()));
         assert_eq!(row.build_status, BuildStatus::Success);
+        assert_eq!(row.documentation_size, Some(42));
         assert!(row.errors.is_none());

         Ok(())
@@ -694,6 +703,7 @@
             "rustc_version",
             "docsrs_version",
             BuildStatus::Failure,
+            None,
             Some("error message"),
         )
         .await?;
@@ -703,6 +713,7 @@
                 rustc_version,
                 docsrs_version,
                 build_status as "build_status: BuildStatus",
+                documentation_size,
                 errors
             FROM builds
             WHERE id = $1"#,
@@ -715,6 +726,7 @@
         assert_eq!(row.docsrs_version, Some("docsrs_version".into()));
         assert_eq!(row.build_status, BuildStatus::Failure);
         assert_eq!(row.errors, Some("error message".into()));
+        assert!(row.documentation_size.is_none());

         Ok(())
     })
diff --git a/src/db/file.rs b/src/db/file.rs
index ad6a91175..f1adaa9d7 100644
--- a/src/db/file.rs
+++ b/src/db/file.rs
@@ -8,11 +8,49 @@
 //! However, postgres is still available for testing and backwards compatibility.
 use crate::error::Result;
-use crate::storage::{AsyncStorage, CompressionAlgorithm, CompressionAlgorithms};
+use crate::storage::{AsyncStorage, CompressionAlgorithm};
+use mime::Mime;
 use serde_json::Value;
+use std::ffi::OsStr;
 use std::path::{Path, PathBuf};
 use tracing::instrument;

+/// represents a file path from our source or documentation builds.
+/// Used to return metadata about the file.
+#[derive(Debug)]
+pub struct FileEntry {
+    pub(crate) path: PathBuf,
+    pub(crate) size: u64,
+}
+
+impl FileEntry {
+    pub(crate) fn mime(&self) -> Mime {
+        detect_mime(&self.path).parse().unwrap()
+    }
+}
+
+pub(crate) fn detect_mime(file_path: impl AsRef<Path>) -> &'static str {
+    let mime = mime_guess::from_path(file_path.as_ref())
+        .first_raw()
+        .unwrap_or("text/plain");
+    match mime {
+        "text/plain" | "text/troff" | "text/x-markdown" | "text/x-rust" | "text/x-toml" => {
+            match file_path.as_ref().extension().and_then(OsStr::to_str) {
+                Some("md") => "text/markdown",
+                Some("rs") => "text/rust",
+                Some("markdown") => "text/markdown",
+                Some("css") => "text/css",
+                Some("toml") => "text/toml",
+                Some("js") => "application/javascript",
+                Some("json") => "application/json",
+                _ => mime,
+            }
+        }
+        "image/svg" => "image/svg+xml",
+        _ => mime,
+    }
+}
+
 /// Store all files in a directory and return [[mimetype, filename]] as Json
 ///
 /// If there is an S3 Client configured, store files into an S3 bucket;
@@ -26,12 +64,8 @@ pub async fn add_path_into_database<P: AsRef<Path>>(
     storage: &AsyncStorage,
     prefix: impl AsRef<Path>,
     path: P,
-) -> Result<(Value, CompressionAlgorithms)> {
-    let (file_list, algorithms) = storage.store_all(prefix.as_ref(), path.as_ref()).await?;
-    Ok((
-        file_list_to_json(file_list.into_iter().collect()),
-        algorithms,
-    ))
+) -> Result<(Vec<FileEntry>, CompressionAlgorithm)> {
+    storage.store_all(prefix.as_ref(), path.as_ref()).await
 }

 #[instrument(skip(storage))]
@@ -40,27 +74,25 @@ pub async fn add_path_into_remote_archive<P: AsRef<Path> + std::fmt::Debug>(
     archive_path: &str,
     path: P,
     public_access: bool,
-) -> Result<(Value, CompressionAlgorithm)> {
+) -> Result<(Vec<FileEntry>, CompressionAlgorithm)> {
     let (file_list, algorithm) = storage
         .store_all_in_archive(archive_path, path.as_ref())
         .await?;
     if public_access {
         storage.set_public_access(archive_path, true).await?;
     }
-    Ok((
-        file_list_to_json(file_list.into_iter().collect()),
-        algorithm,
-    ))
+    Ok((file_list, algorithm))
 }

-fn file_list_to_json(file_list: Vec<(PathBuf, String)>) -> Value {
+pub(crate) fn file_list_to_json(files: impl IntoIterator<Item = FileEntry>) -> Value {
     Value::Array(
-        file_list
+        files
             .into_iter()
-            .map(|(path, name)| {
+            .map(|info| {
                 Value::Array(vec![
-                    Value::String(name),
-                    Value::String(path.into_os_string().into_string().unwrap()),
+                    Value::String(info.mime().as_ref().to_string()),
+                    Value::String(info.path.into_os_string().into_string().unwrap()),
+                    Value::Number(info.size.into()),
                 ])
             })
             .collect(),
diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs
index 055e0ae33..b20a8f662 100644
--- a/src/docbuilder/rustwide_builder.rs
+++ b/src/docbuilder/rustwide_builder.rs
@@ -1,4 +1,4 @@
-use crate::db::file::add_path_into_database;
+use crate::db::file::{add_path_into_database, file_list_to_json};
 use crate::db::{
     add_doc_coverage, add_package_into_database, add_path_into_remote_archive, finish_build,
     initialize_build, initialize_crate, initialize_release, types::BuildStatus,
@@ -476,6 +476,7 @@ impl RustwideBuilder {
            algs.insert(new_alg);
            files_list
        };
+       let source_size: u64 = files_list.iter().map(|info| info.size).sum();
        let metadata = Metadata::from_crate_root(build.host_source_dir())?;
        let BuildTargets {
            default_target,
@@ -532,7 +533,7 @@
        }

        let mut target_build_logs = HashMap::new();
-       if has_docs {
+       let documentation_size = if has_docs {
            debug!("adding documentation for the default target to the database");
            self.copy_docs(
                &build.host_target_dir(),
@@ -557,13 +558,21 @@
                )?;
                target_build_logs.insert(target, target_res.build_log);
            }
-           let (_, new_alg) = self.runtime.block_on(add_path_into_remote_archive(
-               &self.async_storage,
-               &rustdoc_archive_path(name, version),
-               local_storage.path(),
-               true,
-           ))?;
+           let (file_list, new_alg) =
+               self.runtime.block_on(add_path_into_remote_archive(
+                   &self.async_storage,
+                   &rustdoc_archive_path(name, version),
+                   local_storage.path(),
+                   true,
+               ))?;
+           let documentation_size = file_list.iter().map(|info| info.size).sum::<u64>();
+           self.metrics
+               .documentation_size
+               .observe(documentation_size as f64 / 1024.0 / 1024.0);
            algs.insert(new_alg);
+           Some(documentation_size)
+       } else {
+           None
        };

        let has_examples = build.host_source_dir().join("examples").is_dir();
@@ -603,7 +612,7 @@
            cargo_metadata,
            &build.host_source_dir(),
            &res.target,
-           files_list,
+           file_list_to_json(files_list),
            successful_targets,
            &release_data,
            has_docs,
@@ -611,6 +620,7 @@
            algs,
            repository,
            true,
+           source_size,
        ))?;

        if let Some(doc_coverage) = res.doc_coverage {
@@ -632,6 +642,7 @@
            &res.result.rustc_version,
            &res.result.docsrs_version,
            build_status,
+           documentation_size,
            None,
        ))?;

@@ -1093,11 +1104,13 @@
                    r.default_target,
                    r.doc_targets,
                    r.archive_storage,
+                   r.source_size as "source_size!",
                    cov.total_items,
                    b.id as build_id,
                    b.build_status::TEXT as build_status,
                    b.docsrs_version,
-                   b.rustc_version
+                   b.rustc_version,
+                   b.documentation_size
                FROM
                    crates as c
                    INNER JOIN releases AS r ON c.id = r.crate_id
@@ -1120,6 +1133,8 @@
        assert!(!row.docsrs_version.unwrap().is_empty());
        assert!(!row.rustc_version.unwrap().is_empty());
        assert_eq!(row.build_status.unwrap(), "success");
+       assert!(row.source_size > 0);
+       assert!(row.documentation_size.unwrap() > 0);

        let mut targets: Vec<String> = row
            .doc_targets
diff --git a/src/metrics/macros.rs b/src/metrics/macros.rs
index 479300034..81c149f8b 100644
--- a/src/metrics/macros.rs
+++ b/src/metrics/macros.rs
@@ -23,6 +23,7 @@ macro_rules! metrics {
            pub(crate) cdn_invalidation_time: prometheus::HistogramVec,
            pub(crate) cdn_queue_time: prometheus::HistogramVec,
            pub(crate) build_time: prometheus::Histogram,
+           pub(crate) documentation_size: prometheus::Histogram,
        }
        impl $name {
            $vis fn new() -> Result<Self> {
@@ -72,12 +73,23 @@ macro_rules! metrics {
                )?;
                registry.register(Box::new(build_time.clone()))?;

+               let documentation_size = prometheus::Histogram::with_opts(
+                   prometheus::HistogramOpts::new(
+                       "documentation_size",
+                       "size of the documentation in MB"
+                   )
+                   .namespace($namespace)
+                   .buckets($crate::metrics::DOCUMENTATION_SIZE_BUCKETS.to_vec())
+               )?;
+               registry.register(Box::new(documentation_size.clone()))?;
+
                Ok(Self {
                    registry,
                    recently_accessed_releases: RecentlyAccessedReleases::new(),
                    cdn_invalidation_time,
                    cdn_queue_time,
                    build_time,
+                   documentation_size,
                    $(
                        $(#[$meta])*
                        $metric,
diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs
index 50e52f236..5035b4bc4 100644
--- a/src/metrics/mod.rs
+++ b/src/metrics/mod.rs
@@ -39,6 +39,18 @@ pub const CDN_INVALIDATION_HISTOGRAM_BUCKETS: &[f64; 11] = &[
     24000.0, // 400
 ];

+/// buckets for documentation size, in MiB
+/// Base for some estimates:
+/// * the `itertools` docs are an 8.2 MB archive with 144 MB of docs
+/// * the biggest doc archive we know of (`stm32ral`) is a 1.8 GiB archive,
+///   which would be an estimated 32 GiB of docs based on the compression
+///   ratio above.
+/// * we don't know the distribution of these doc sizes yet.
+pub const DOCUMENTATION_SIZE_BUCKETS: &[f64; 16] = &[
+    1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0, 4096.0, 8192.0,
+    16384.0, 32768.0,
+];
+
 /// the measured times of building crates will be put into these buckets
 pub fn build_time_histogram_buckets() -> Vec<f64> {
     vec![
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index 6d8078f74..092c4a4ef 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -6,15 +6,22 @@ mod s3;
 pub use self::compression::{compress, decompress, CompressionAlgorithm, CompressionAlgorithms};
 use self::database::DatabaseBackend;
 use self::s3::S3Backend;
-use crate::{db::Pool, error::Result, utils::spawn_blocking, Config, InstanceMetrics};
-use anyhow::{anyhow, ensure};
+use crate::{
+    db::{
+        file::{detect_mime, FileEntry},
+        Pool,
+    },
+    error::Result,
+    utils::spawn_blocking,
+    Config, InstanceMetrics,
+};
+use anyhow::anyhow;
 use chrono::{DateTime, Utc};
 use fn_error_context::context;
 use futures_util::stream::BoxStream;
 use path_slash::PathExt;
+use std::iter;
 use std::{
-    collections::{HashMap, HashSet},
-    ffi::OsStr,
     fmt, fs,
     io::{self, BufReader},
     ops::RangeInclusive,
@@ -23,6 +30,7 @@ use std::{
 };
 use tokio::{io::AsyncWriteExt, runtime::Runtime};
 use tracing::{error, info_span, instrument, trace};
+use walkdir::WalkDir;

 type FileRange = RangeInclusive<u64>;

@@ -45,40 +53,40 @@ impl Blob {
     }
 }

-fn get_file_list_from_dir<P: AsRef<Path>>(path: P, files: &mut Vec<PathBuf>) -> Result<()> {
-    let path = path.as_ref();
-
-    for file in path.read_dir()? {
-        let file = file?;
-
-        if file.file_type()?.is_file() {
-            files.push(file.path());
-        } else if file.file_type()?.is_dir() {
-            get_file_list_from_dir(file.path(), files)?;
-        }
-    }
-
-    Ok(())
-}
-
-#[instrument]
-pub fn get_file_list<P: AsRef<Path> + std::fmt::Debug>(path: P) -> Result<Vec<PathBuf>> {
-    let path = path.as_ref();
-    let mut files = Vec::new();
-
-    ensure!(path.exists(), "File not found");
-
+pub fn get_file_list<P: AsRef<Path>>(path: P) -> Box<dyn Iterator<Item = Result<PathBuf>>> {
+    let path = path.as_ref().to_path_buf();
     if path.is_file() {
-        files.push(PathBuf::from(path.file_name().unwrap()));
+        let path = if let Some(parent) = path.parent() {
+            path.strip_prefix(parent).unwrap().to_path_buf()
+        } else {
+            path
+        };
+
+        Box::new(iter::once(Ok(path)))
     } else if path.is_dir() {
-        get_file_list_from_dir(path, &mut files)?;
-        for file_path in &mut files {
-            // We want the paths in this list to not be {path}/bar.txt but just bar.txt
-            *file_path = PathBuf::from(file_path.strip_prefix(path).unwrap());
-        }
-    }
+        Box::new(
+            WalkDir::new(path.clone())
+                .into_iter()
+                .filter_map(move |result| {
+                    let direntry = match result {
+                        Ok(de) => de,
+                        Err(err) => return Some(Err(err.into())),
+                    };

-    Ok(files)
+                    if !direntry.file_type().is_dir() {
+                        Some(Ok(direntry
+                            .path()
+                            .strip_prefix(&path)
+                            .unwrap()
+                            .to_path_buf()))
+                    } else {
+                        None
+                    }
+                }),
+        )
+    } else {
+        Box::new(iter::empty())
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -382,7 +390,7 @@
        &self,
        archive_path: &str,
        root_dir: &Path,
-    ) -> Result<(HashMap<PathBuf, String>, CompressionAlgorithm)> {
+    ) -> Result<(Vec<FileEntry>, CompressionAlgorithm)> {
        let (zip_content, compressed_index_content, alg, remote_index_path, file_paths) =
            spawn_blocking({
                let archive_path = archive_path.to_owned();
@@ -390,7 +398,7 @@
                let temp_dir = self.config.temp_dir.clone();

                move || {
-                   let mut file_paths = HashMap::new();
+                   let mut file_paths = Vec::new();

                    // We are only using the `zip` library to create the archives and the matching
                    // index-file. The ZIP format allows more compression formats, and these can even be mixed
@@ -411,14 +419,13 @@
                        .compression_method(zip::CompressionMethod::Bzip2);

                    let mut zip = zip::ZipWriter::new(io::Cursor::new(Vec::new()));
-                   for file_path in get_file_list(&root_dir)? {
-                       let mut file = fs::File::open(root_dir.join(&file_path))?;
+                   for file_path in get_file_list(&root_dir) {
+                       let file_path = file_path?;
+                       let mut file = fs::File::open(root_dir.join(&file_path))?;
                        zip.start_file(file_path.to_str().unwrap(), options)?;
                        io::copy(&mut file, &mut zip)?;
-
-                       let mime = detect_mime(&file_path);
-                       file_paths.insert(file_path, mime.to_string());
+                       file_paths.push(FileEntry { path: file_path, size: file.metadata()?.len() });
                    }

                    zip.finish()?.into_inner()
@@ -468,61 +475,62 @@
            ])
            .await?;

-       let file_alg = CompressionAlgorithm::Bzip2;
-       Ok((file_paths, file_alg))
+       Ok((file_paths, CompressionAlgorithm::Bzip2))
    }

-   // Store all files in `root_dir` into the backend under `prefix`.
-   //
-   // This returns (map, set).
+   /// Store all files in `root_dir` into the backend under `prefix`.
    #[instrument(skip(self))]
    pub(crate) async fn store_all(
        &self,
        prefix: &Path,
        root_dir: &Path,
-    ) -> Result<(HashMap<PathBuf, String>, HashSet<CompressionAlgorithm>)> {
-       let (blobs, file_paths_and_mimes, algs) = spawn_blocking({
+    ) -> Result<(Vec<FileEntry>, CompressionAlgorithm)> {
+       let alg = CompressionAlgorithm::default();
+
+       let (blobs, file_paths_and_mimes) = spawn_blocking({
            let prefix = prefix.to_owned();
            let root_dir = root_dir.to_owned();
            move || {
-               let mut file_paths_and_mimes = HashMap::new();
-               let mut algs = HashSet::with_capacity(1);
-               let blobs: Vec<_> = get_file_list(&root_dir)?
-                   .into_iter()
-                   .filter_map(|file_path| {
-                       // Some files have insufficient permissions
-                       // (like .lock file created by cargo in documentation directory).
-                       // Skip these files.
-                       fs::File::open(root_dir.join(&file_path))
-                           .ok()
-                           .map(|file| (file_path, file))
-                   })
-                   .map(|(file_path, file)| -> Result<_> {
-                       let alg = CompressionAlgorithm::default();
-                       let content = compress(file, alg)?;
-                       let bucket_path = prefix.join(&file_path).to_slash().unwrap().to_string();
-
-                       let mime = detect_mime(&file_path);
-                       file_paths_and_mimes.insert(file_path, mime.to_string());
-                       algs.insert(alg);
-
-                       Ok(Blob {
-                           path: bucket_path,
-                           mime: mime.to_string(),
-                           content,
-                           compression: Some(alg),
-                           // this field is ignored by the backend
-                           date_updated: Utc::now(),
-                       })
-                   })
-                   .collect::<Result<Vec<_>>>()?;
-               Ok((blobs, file_paths_and_mimes, algs))
+               let mut file_paths = Vec::new();
+               let mut blobs: Vec<Blob> = Vec::new();
+               for file_path in get_file_list(&root_dir) {
+                   let file_path = file_path?;
+
+                   // Some files have insufficient permissions
+                   // (like .lock file created by cargo in documentation directory).
+                   // Skip these files.
+                   let Ok(file) = fs::File::open(root_dir.join(&file_path)) else {
+                       continue;
+                   };
+
+                   let file_size = file.metadata()?.len();
+
+                   let content = compress(file, alg)?;
+                   let bucket_path = prefix.join(&file_path).to_slash().unwrap().to_string();
+
+                   let file_info = FileEntry {
+                       path: file_path,
+                       size: file_size,
+                   };
+                   let mime = file_info.mime().to_string();
+                   file_paths.push(file_info);
+
+                   blobs.push(Blob {
+                       path: bucket_path,
+                       mime,
+                       content,
+                       compression: Some(alg),
+                       // this field is ignored by the backend
+                       date_updated: Utc::now(),
+                   });
+               }
+               Ok((blobs, file_paths))
            }
        })
        .await?;

        self.store_inner(blobs).await?;
-       Ok((file_paths_and_mimes, algs))
+       Ok((file_paths_and_mimes, alg))
    }

    #[cfg(test)]
@@ -735,7 +743,7 @@
        &self,
        archive_path: &str,
        root_dir: &Path,
-    ) -> Result<(HashMap<PathBuf, String>, CompressionAlgorithm)> {
+    ) -> Result<(Vec<FileEntry>, CompressionAlgorithm)> {
        self.runtime
            .block_on(self.inner.store_all_in_archive(archive_path, root_dir))
    }
@@ -744,7 +752,7 @@
        &self,
        prefix: &Path,
        root_dir: &Path,
-    ) -> Result<(HashMap<PathBuf, String>, HashSet<CompressionAlgorithm>)> {
+    ) -> Result<(Vec<FileEntry>, CompressionAlgorithm)> {
        self.runtime
            .block_on(self.inner.store_all(prefix, root_dir))
    }
@@ -801,28 +809,6 @@
     }
 }

-fn detect_mime(file_path: impl AsRef<Path>) -> &'static str {
-    let mime = mime_guess::from_path(file_path.as_ref())
-        .first_raw()
-        .unwrap_or("text/plain");
-    match mime {
-        "text/plain" | "text/troff" | "text/x-markdown" | "text/x-rust" | "text/x-toml" => {
-            match file_path.as_ref().extension().and_then(OsStr::to_str) {
-                Some("md") => "text/markdown",
-                Some("rs") => "text/rust",
-                Some("markdown") => "text/markdown",
-                Some("css") => "text/css",
-                Some("toml") => "text/toml",
-                Some("js") => "text/javascript",
-                Some("json") => "application/json",
-                _ => mime,
-            }
-        }
-        "image/svg" => "image/svg+xml",
-        _ => mime,
-    }
-}
-
 pub(crate) fn rustdoc_archive_path(name: &str, version: &str) -> String {
     format!("rustdoc/{name}/{version}.zip")
 }
@@ -837,14 +823,17 @@ mod test {
     use std::env;

     #[test]
-    fn test_get_file_list() {
+    fn test_get_file_list() -> Result<()> {
         crate::test::init_logger();
-        let files = get_file_list(env::current_dir().unwrap());
-        assert!(files.is_ok());
-        assert!(!files.unwrap().is_empty());
+        let dir = env::current_dir().unwrap();

-        let files = get_file_list(env::current_dir().unwrap().join("Cargo.toml")).unwrap();
+        let files: Vec<_> = get_file_list(&dir).collect::<Result<Vec<_>>>()?;
+        assert!(!files.is_empty());
+
+        let files: Vec<_> = get_file_list(dir.join("Cargo.toml")).collect::<Result<Vec<_>>>()?;
         assert_eq!(files[0], std::path::Path::new("Cargo.toml"));
+
+        Ok(())
     }

     #[test]
@@ -879,6 +868,11 @@
 mod backend_tests {
     use super::*;

+    fn get_file_info(files: &[FileEntry], path: impl AsRef<Path>) -> Option<&FileEntry> {
+        let path = path.as_ref();
+        files.iter().find(|info| info.path == path)
+    }
+
     fn test_exists(storage: &Storage) -> Result<()> {
         assert!(!storage.exists("path/to/file.txt").unwrap());
         let blob = Blob {
@@ -1138,15 +1132,14 @@
        assert_eq!(compression_alg, CompressionAlgorithm::Bzip2);
        assert_eq!(stored_files.len(), files.len());
        for name in &files {
-           let name = Path::new(name);
-           assert!(stored_files.contains_key(name));
+           assert!(get_file_info(&stored_files, name).is_some());
        }
        assert_eq!(
-           stored_files.get(Path::new("Cargo.toml")).unwrap(),
+           get_file_info(&stored_files, "Cargo.toml").unwrap().mime(),
            "text/toml"
        );
        assert_eq!(
-           stored_files.get(Path::new("src/main.rs")).unwrap(),
+           get_file_info(&stored_files, "src/main.rs").unwrap().mime(),
            "text/rust"
        );

@@ -1194,15 +1187,14 @@
        let (stored_files, algs) = storage.store_all(Path::new("prefix"), dir.path())?;
        assert_eq!(stored_files.len(), files.len());
        for name in &files {
-           let name = Path::new(name);
-           assert!(stored_files.contains_key(name));
+           assert!(get_file_info(&stored_files, name).is_some());
        }
        assert_eq!(
-           stored_files.get(Path::new("Cargo.toml")).unwrap(),
+           get_file_info(&stored_files, "Cargo.toml").unwrap().mime(),
            "text/toml"
        );
        assert_eq!(
-           stored_files.get(Path::new("src/main.rs")).unwrap(),
+           get_file_info(&stored_files, "src/main.rs").unwrap().mime(),
            "text/rust"
        );

@@ -1216,9 +1208,7 @@
        assert_eq!(file.mime, "text/rust");
        assert_eq!(file.path, "prefix/src/main.rs");

-       let mut expected_algs = HashSet::new();
-       expected_algs.insert(CompressionAlgorithm::default());
-       assert_eq!(algs, expected_algs);
+       assert_eq!(algs, CompressionAlgorithm::default());

        assert_eq!(2, metrics.uploaded_files_total.get());
diff --git a/src/test/fakes.rs b/src/test/fakes.rs
index b6314c8c0..ceecd709a 100644
--- a/src/test/fakes.rs
+++ b/src/test/fakes.rs
@@ -1,19 +1,20 @@
 use super::TestDatabase;
+use crate::db::file::{file_list_to_json, FileEntry};
 use crate::db::types::BuildStatus;
 use crate::db::{initialize_build, initialize_crate, initialize_release, update_build_status};
 use crate::docbuilder::DocCoverage;
 use crate::error::Result;
 use crate::registry_api::{CrateData, CrateOwner, ReleaseData};
 use crate::storage::{
-    rustdoc_archive_path, source_archive_path, AsyncStorage, CompressionAlgorithms,
+    rustdoc_archive_path, source_archive_path, AsyncStorage, CompressionAlgorithm,
 };
 use crate::utils::{Dependency, MetadataPackage, Target};
 use anyhow::{bail, Context};
 use base64::{engine::general_purpose::STANDARD as b64, Engine};
 use chrono::{DateTime, Utc};
-use serde_json::Value;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
+use std::iter;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 use tracing::debug;
@@ -390,7 +391,7 @@
        archive_storage: bool,
        package: &MetadataPackage,
        storage: &AsyncStorage,
-    ) -> Result<(Value, CompressionAlgorithms)> {
+    ) -> Result<(Vec<FileEntry>, CompressionAlgorithm)> {
        debug!(
            "adding directory {:?} from {}",
            kind,
@@ -413,9 +414,7 @@
                public,
            )
            .await?;
-           let mut hm = HashSet::new();
-           hm.insert(new_alg);
-           Ok((files_list, hm))
+           Ok((files_list, new_alg))
        } else {
            let prefix = match kind {
                FileKind::Rustdoc => "rustdoc",
@@ -459,7 +458,7 @@
            &storage,
        )
        .await?;
-       debug!("added source files {}", source_meta);
+       debug!(?source_meta, "added source files");

        // If the test didn't add custom builds, inject a default one
        let builds = self.builds.unwrap_or_else(|| vec![FakeBuild::default()]);
@@ -486,7 +485,7 @@
            debug!("added platform files for {}", platform);
        }

-       let (rustdoc_meta, _) = upload_files(
+       let (files, _) = upload_files(
            FileKind::Rustdoc,
            rustdoc_path,
            archive_storage,
@@ -494,7 +493,7 @@
            &storage,
        )
        .await?;
-       debug!("uploaded rustdoc files: {}", rustdoc_meta);
+       debug!(?files, "uploaded rustdoc files");
        }

        let mut async_conn = db.async_conn().await;
@@ -520,14 +519,15 @@
            &package,
            crate_dir,
            default_target,
-           source_meta,
+           file_list_to_json(source_meta),
            self.doc_targets,
            &self.registry_release_data,
            self.has_docs,
            self.has_examples,
-           algs,
+           iter::once(algs),
            repository,
            archive_storage,
+           24,
        )
        .await?;
        crate::db::update_crate_data_in_database(
@@ -651,6 +651,7 @@
            &self.rustc_version,
            &self.docsrs_version,
            self.build_status,
+           Some(42),
            None,
        )
        .await?;
diff --git a/src/web/releases.rs b/src/web/releases.rs
index c06e5e633..a0ce35535 100644
--- a/src/web/releases.rs
+++ b/src/web/releases.rs
@@ -887,6 +887,7 @@ mod tests {
            "docs.rs 4.0.0",
            BuildStatus::Success,
            None,
+           None,
        )
        .await?;