diff --git a/Cargo.lock b/Cargo.lock
index 71ecf2a2e6a..39266ccbe19 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -183,6 +183,15 @@ version = "1.0.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
 
+[[package]]
+name = "arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
+dependencies = [
+ "derive_arbitrary",
+]
+
 [[package]]
 name = "async-compression"
 version = "0.4.11"
@@ -1029,6 +1038,7 @@ dependencies = [
  "typomania",
  "unicode-xid",
  "url",
+ "zip",
 ]
 
 [[package]]
@@ -1410,6 +1420,17 @@ dependencies = [
  "powerfmt",
 ]
 
+[[package]]
+name = "derive_arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.66",
+]
+
 [[package]]
 name = "derive_builder"
 version = "0.20.0"
@@ -1544,6 +1565,17 @@ dependencies = [
  "subtle",
 ]
 
+[[package]]
+name = "displaydoc"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.66",
+]
+
 [[package]]
 name = "doc-comment"
 version = "0.3.3"
@@ -2596,6 +2628,12 @@ dependencies = [
  "scopeguard",
 ]
 
+[[package]]
+name = "lockfree-object-pool"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e"
+
 [[package]]
 name = "log"
 version = "0.4.21"
@@ -4093,6 +4131,12 @@ dependencies = [
  "rand_core",
 ]
 
+[[package]]
+name = "simd-adler32"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
+
 [[package]]
 name = "similar"
 version = "2.5.0"
@@ -5269,6 +5313,37 @@ version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
 
+[[package]]
+name = "zip"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dd56a4d5921bc2f99947ac5b3abe5f510b1be7376fdc5e9fce4a23c6a93e87c"
+dependencies = [
+ "arbitrary",
+ "crc32fast",
+ "crossbeam-utils",
+ "displaydoc",
+ "flate2",
+ "indexmap",
+ "memchr",
+ "thiserror",
+ "zopfli",
+]
+
+[[package]]
+name = "zopfli"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946"
+dependencies = [
+ "bumpalo",
+ "crc32fast",
+ "lockfree-object-pool",
+ "log",
+ "once_cell",
+ "simd-adler32",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.1"
diff --git a/Cargo.toml b/Cargo.toml
index 257109f0753..fd2edcb9a2d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -120,6 +120,7 @@ tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] }
 typomania = { version = "=0.1.2", default-features = false }
 url = "=2.5.0"
 unicode-xid = "=0.2.4"
+zip = { version = "=2.1.1", default-features = false, features = ["deflate"] }
 
 [dev-dependencies]
 bytes = "=1.6.0"
diff --git a/deny.toml b/deny.toml
index 42c125915f4..75f2e61d858 100644
--- a/deny.toml
+++ b/deny.toml
@@ -100,6 +100,7 @@ allow = [
     #"Apache-2.0 WITH LLVM-exception",
     "BSD-2-Clause",
    "BSD-3-Clause",
+    "BSL-1.0",
    "ISC",
    "MIT",
    "MPL-2.0",
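
A note on the dependency changes above: `zip` is pinned to 2.1.1 with default features off and only `deflate` enabled, which is what pulls `zopfli`, `lockfree-object-pool` and `simd-adler32` into the lockfile; the new `BSL-1.0` entry in deny.toml appears to cover one of those new transitive dependencies. The exact-version pin (`=2.1.1`) follows the existing convention in this Cargo.toml. As a rough, illustrative sketch of how the pinned zip 2.1 API is used (not code from this patch; the file handle and entry name are placeholders, and `anyhow` is assumed, as elsewhere in the repository):

```rust
// Minimal sketch: writing a deflate-compressed entry with the zip 2.1 API.
use std::io::Write;
use zip::write::SimpleFileOptions;
use zip::{CompressionMethod, ZipWriter};

fn write_sample_zip(file: std::fs::File) -> anyhow::Result<()> {
    let mut zip = ZipWriter::new(file);
    // Only the `deflate` feature is enabled in Cargo.toml, so Deflated is the
    // compression method available here.
    let options = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
    zip.start_file("README.md", options)?;
    zip.write_all(b"example contents")?;
    zip.finish()?;
    Ok(())
}
```
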
LLVM-exception", "BSD-2-Clause", "BSD-3-Clause", + "BSL-1.0", "ISC", "MIT", "MPL-2.0", diff --git a/src/storage.rs b/src/storage.rs index cd7bb81055a..ef254da5017 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -19,7 +19,8 @@ const PREFIX_CRATES: &str = "crates"; const PREFIX_READMES: &str = "readmes"; const DEFAULT_REGION: &str = "us-west-1"; const CONTENT_TYPE_CRATE: &str = "application/gzip"; -const CONTENT_TYPE_DB_DUMP: &str = "application/gzip"; +const CONTENT_TYPE_GZIP: &str = "application/gzip"; +const CONTENT_TYPE_ZIP: &str = "application/zip"; const CONTENT_TYPE_INDEX: &str = "text/plain"; const CONTENT_TYPE_README: &str = "text/html"; const CACHE_CONTROL_IMMUTABLE: &str = "public,max-age=31536000,immutable"; @@ -126,7 +127,8 @@ impl Storage { // The `BufWriter::new()` API currently does not allow // specifying any file attributes, so we need to set the // content type here instead for the database dump upload. - .with_content_type_for_suffix("gz", CONTENT_TYPE_DB_DUMP); + .with_content_type_for_suffix("gz", CONTENT_TYPE_GZIP) + .with_content_type_for_suffix("zip", CONTENT_TYPE_ZIP); let store = build_s3(default, options); diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs index 71ccbf959a8..f42ce881e72 100644 --- a/src/tests/dump_db.rs +++ b/src/tests/dump_db.rs @@ -9,7 +9,7 @@ use insta::{assert_debug_snapshot, assert_snapshot}; use once_cell::sync::Lazy; use regex::Regex; use secrecy::ExposeSecret; -use std::io::Read; +use std::io::{Cursor, Read}; use tar::Archive; static PATH_DATE_RE: Lazy = Lazy::new(|| Regex::new(r"^\d{4}-\d{2}-\d{2}-\d{6}").unwrap()); @@ -28,8 +28,9 @@ async fn test_dump_db_job() { app.run_pending_background_jobs().await; let stored_files = app.stored_files().await; - assert_eq!(stored_files.len(), 1); + assert_eq!(stored_files.len(), 2); assert_eq!(stored_files[0], "db-dump.tar.gz"); + assert_eq!(stored_files[1], "db-dump.zip"); let path = object_store::path::Path::parse("db-dump.tar.gz").unwrap(); let result = app.as_inner().storage.as_inner().get(&path).await.unwrap(); @@ -65,6 +66,38 @@ async fn test_dump_db_job() { "YYYY-MM-DD-HHMMSS/data/version_downloads.csv", ] "###); + + let path = object_store::path::Path::parse("db-dump.zip").unwrap(); + let result = app.as_inner().storage.as_inner().get(&path).await.unwrap(); + let bytes = result.bytes().await.unwrap(); + + let archive = zip::ZipArchive::new(Cursor::new(bytes)).unwrap(); + let zip_paths = archive.file_names().collect::>(); + assert_debug_snapshot!(zip_paths, @r###" + [ + "README.md", + "export.sql", + "import.sql", + "metadata.json", + "schema.sql", + "data/", + "data/categories.csv", + "data/crate_downloads.csv", + "data/crates.csv", + "data/keywords.csv", + "data/metadata.csv", + "data/reserved_crate_names.csv", + "data/teams.csv", + "data/users.csv", + "data/crates_categories.csv", + "data/crates_keywords.csv", + "data/crate_owners.csv", + "data/versions.csv", + "data/default_versions.csv", + "data/dependencies.csv", + "data/version_downloads.csv", + ] + "###); } fn tar_paths(archive: &mut Archive) -> Vec { diff --git a/src/worker/jobs/dump_db.rs b/src/worker/jobs/dump_db.rs index b1b13a93ce3..8a273c6a1d9 100644 --- a/src/worker/jobs/dump_db.rs +++ b/src/worker/jobs/dump_db.rs @@ -6,6 +6,7 @@ use crates_io_worker::BackgroundJob; use std::fs::{self, File}; use std::path::{Path, PathBuf}; use std::sync::Arc; +use zip::write::SimpleFileOptions; #[derive(Clone, Serialize, Deserialize)] pub struct DumpDb { @@ -28,38 +29,60 @@ impl BackgroundJob for DumpDb { /// Create CSV dumps of the 
diff --git a/src/worker/jobs/dump_db.rs b/src/worker/jobs/dump_db.rs
index b1b13a93ce3..8a273c6a1d9 100644
--- a/src/worker/jobs/dump_db.rs
+++ b/src/worker/jobs/dump_db.rs
@@ -6,6 +6,7 @@ use crates_io_worker::BackgroundJob;
 use std::fs::{self, File};
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
+use zip::write::SimpleFileOptions;
 
 #[derive(Clone, Serialize, Deserialize)]
 pub struct DumpDb {
@@ -28,38 +29,60 @@ impl BackgroundJob for DumpDb {
     /// Create CSV dumps of the public information in the database, wrap them in a
     /// tarball and upload to S3.
     async fn run(&self, env: Self::Context) -> anyhow::Result<()> {
-        let target_name = "db-dump.tar.gz";
+        const TAR_PATH: &str = "db-dump.tar.gz";
+        const ZIP_PATH: &str = "db-dump.zip";
+
         let database_url = self.database_url.clone();
 
-        let tarball = spawn_blocking(move || {
+        let archives = spawn_blocking(move || {
             let directory = DumpDirectory::create()?;
 
-            info!("Begin exporting database");
+            info!("Exporting database…");
             directory.populate(&database_url)?;
 
             let export_dir = directory.path();
-            info!(path = ?export_dir, "Creating tarball");
-            let prefix = PathBuf::from(directory.timestamp.format("%Y-%m-%d-%H%M%S").to_string());
-            create_tarball(export_dir, &prefix)
+            info!(path = ?export_dir, "Creating tarball…");
+            let tarball_prefix =
+                PathBuf::from(directory.timestamp.format("%Y-%m-%d-%H%M%S").to_string());
+            create_archives(export_dir, &tarball_prefix)
         })
         .await?;
 
-        info!("Uploading tarball");
+        info!("Uploading tarball…");
         env.storage
-            .upload_db_dump(target_name, tarball.path())
+            .upload_db_dump(TAR_PATH, archives.tar.path())
             .await?;
         info!("Database dump tarball uploaded");
 
-        info!("Invalidating CDN caches");
+        info!("Invalidating CDN caches…");
+        if let Some(cloudfront) = env.cloudfront() {
+            if let Err(error) = cloudfront.invalidate(TAR_PATH).await {
+                warn!("Failed to invalidate CloudFront cache: {}", error);
+            }
+        }
+
+        if let Some(fastly) = env.fastly() {
+            if let Err(error) = fastly.invalidate(TAR_PATH).await {
+                warn!("Failed to invalidate Fastly cache: {}", error);
+            }
+        }
+
+        info!("Uploading zip file…");
+        env.storage
+            .upload_db_dump(ZIP_PATH, archives.zip.path())
+            .await?;
+        info!("Database dump zip file uploaded");
+
+        info!("Invalidating CDN caches…");
         if let Some(cloudfront) = env.cloudfront() {
-            if let Err(error) = cloudfront.invalidate(target_name).await {
-                warn!("failed to invalidate CloudFront cache: {}", error);
+            if let Err(error) = cloudfront.invalidate(ZIP_PATH).await {
+                warn!("Failed to invalidate CloudFront cache: {}", error);
             }
         }
 
         if let Some(fastly) = env.fastly() {
-            if let Err(error) = fastly.invalidate(target_name).await {
-                warn!("failed to invalidate Fastly cache: {}", error);
+            if let Err(error) = fastly.invalidate(ZIP_PATH).await {
+                warn!("Failed to invalidate Fastly cache: {}", error);
             }
         }
 
@@ -202,15 +225,24 @@ pub fn run_psql(script: &Path, database_url: &str) -> anyhow::Result<()> {
     Ok(())
 }
 
-fn create_tarball(export_dir: &Path, prefix: &Path) -> anyhow::Result<tempfile::NamedTempFile> {
-    debug!("Creating tarball file");
-    let tempfile = tempfile::NamedTempFile::new()?;
-    let encoder = flate2::write::GzEncoder::new(tempfile.as_file(), flate2::Compression::default());
+struct Archives {
+    tar: tempfile::NamedTempFile,
+    zip: tempfile::NamedTempFile,
+}
+
+fn create_archives(export_dir: &Path, tarball_prefix: &Path) -> anyhow::Result<Archives> {
+    debug!("Creating tarball file…");
+    let tar_tempfile = tempfile::NamedTempFile::new()?;
+    let encoder =
+        flate2::write::GzEncoder::new(tar_tempfile.as_file(), flate2::Compression::default());
+    let mut tar = tar::Builder::new(encoder);
 
-    let mut archive = tar::Builder::new(encoder);
+    debug!("Creating zip file…");
+    let zip_tempfile = tempfile::NamedTempFile::new()?;
+    let mut zip = zip::ZipWriter::new(zip_tempfile.as_file());
 
-    debug!(path = ?prefix, "Appending directory to tarball");
-    archive.append_dir(prefix, export_dir)?;
+    debug!("Appending `{tarball_prefix:?}` directory to tarball…");
+    tar.append_dir(tarball_prefix, export_dir)?;
 
     // Append readme, metadata, schemas.
     let mut paths = Vec::new();
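
The rest of `create_archives` (next hunk) appends each exported file to both archives; as the two snapshot listings in the tests show, tar entries are placed under the timestamped prefix while zip entries keep bare paths. A hypothetical helper illustrating that per-file pattern, not code from this patch (the name, signature, and `anyhow` usage are assumptions):

```rust
// Sketch: append one exported file to both a tar builder and a zip writer.
use std::fs::File;
use std::io::{self, Seek, Write};
use std::path::Path;
use zip::write::SimpleFileOptions;

fn append_to_both<W1: Write, W2: Write + Seek>(
    tar: &mut tar::Builder<W1>,
    zip: &mut zip::ZipWriter<W2>,
    export_dir: &Path,
    tarball_prefix: &Path,
    file_name: &str,
) -> anyhow::Result<()> {
    let path = export_dir.join(file_name);

    // Tar entry: placed under the timestamped prefix.
    tar.append_path_with_name(&path, tarball_prefix.join(file_name))?;

    // Zip entry: bare path, written with the writer's default options.
    zip.start_file(file_name, SimpleFileOptions::default())?;
    io::copy(&mut File::open(&path)?, zip)?;

    Ok(())
}
```
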
@@ -224,9 +256,13 @@ fn create_tarball(export_dir: &Path, prefix: &Path) -> anyhow::Result<tempfile::NamedTempFile> {
 anyhow::Result>();
+        assert_debug_snapshot!(zip_paths, @r###"
+        [
+            "README.md",
+            "data/",
+            "data/crates.csv",
+            "data/users.csv",
+            "data/crate_owners.csv",
+        ]
+        "###);
     }
 }
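
For dump consumers, the practical upside of the zip format is by-name access to individual entries without streaming through an entire gzipped tarball. An illustrative sketch of reading one CSV out of a downloaded `db-dump.zip` (not part of this change; the path handling is made up and `anyhow` is assumed):

```rust
// Sketch: pull a single CSV out of the zip dump by entry name.
use std::fs::File;
use std::io::Read;
use std::path::Path;
use zip::ZipArchive;

fn read_crates_csv(dump: &Path) -> anyhow::Result<String> {
    // Open the archive and read one entry without unpacking anything else.
    let mut archive = ZipArchive::new(File::open(dump)?)?;
    let mut csv = String::new();
    archive.by_name("data/crates.csv")?.read_to_string(&mut csv)?;
    Ok(csv)
}
```
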