diff --git a/Cargo.toml b/Cargo.toml
index bf55e55..6739dc8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,34 +1,35 @@
 [package]
 name = "cc-downloader"
-version = "0.5.2"
-edition = "2021"
+version = "0.6.0"
+edition = "2024"
 authors = ["Pedro Ortiz Suarez "]
 description = "A polite and user-friendly downloader for Common Crawl data."
 license = "MIT OR Apache-2.0"
-rust-version = "1.83"
+rust-version = "1.85"
 readme = "README.md"
 homepage = "https://commoncrawl.org"
 repository = "https://github.com/commoncrawl/cc-downloader"
 documentation = "https://docs.rs/cc-downloader"
 
 [dependencies]
-clap = { version = "4.5.29", features = ["derive"] }
-flate2 = "1.0.35"
+clap = { version = "4.5.32", features = ["derive"] }
+flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
-reqwest = { version = "0.12.12", default-features = false, features = [
+regex = "1.11.1"
+reqwest = { version = "0.12.14", default-features = false, features = [
     "stream",
     "rustls-tls",
 ] }
-reqwest-middleware = "0.4.0"
+reqwest-middleware = "0.4.1"
 reqwest-retry = "0.7.0"
-tokio = { version = "1.43.0", features = ["full"] }
-tokio-util = { version = "0.7.13", features = ["compat"] }
+tokio = { version = "1.44.1", features = ["full"] }
+tokio-util = { version = "0.7.14", features = ["compat"] }
 url = "2.5.4"
 
 [dev-dependencies]
-serde = { version = "1.0.217", features = ["derive"] }
-reqwest = { version = "0.12.12", default-features = false, features = [
+serde = { version = "1.0.219", features = ["derive"] }
+reqwest = { version = "0.12.14", default-features = false, features = [
     "stream",
     "rustls-tls",
     "json",
diff --git a/README.md b/README.md
index 6b657d2..7d653d5 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ Download paths for a given crawl
 Usage: cc-downloader download-paths
 
 Arguments:
-  Crawl reference, e.g. CC-MAIN-2021-04
+  Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
   Data type [possible values: segment, warc, wat, wet, robotstxt, non200responses, cc-index, cc-index-table]
   Destination folder
 
diff --git a/SECURITY.md b/SECURITY.md
index 6480dce..29c9414 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -6,8 +6,8 @@ Only the latest minor version is being supported
 
 | Version | Supported          |
 | ------- | ------------------ |
-| 0.5.x   | :white_check_mark: |
-| < 0.5.0 | :x:                |
+| 0.6.x   | :white_check_mark: |
+| < 0.6.0 | :x:                |
 
 ## Reporting a Vulnerability
 
diff --git a/src/cli.rs b/src/cli.rs
index 20b8144..38d5dad 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,6 +1,7 @@
 use std::path::PathBuf;
 
 use clap::{Parser, Subcommand, ValueEnum};
+use regex::Regex;
 
 #[derive(Parser)]
 #[command(version, about, long_about = None)]
@@ -13,8 +14,8 @@ pub struct Cli {
 pub enum Commands {
     /// Download paths for a given crawl
     DownloadPaths {
-        /// Crawl reference, e.g. CC-MAIN-2021-04
-        #[arg(value_name = "CRAWL")]
+        /// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
+        #[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
         snapshot: String,
 
         /// Data type
@@ -89,3 +90,16 @@ impl DataType {
         }
     }
 }
+
+fn crawl_name_format(crawl: &str) -> Result<String, String> {
+    let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    let crawl_ref = crawl.to_uppercase();
+
+    if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
+        Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
+    } else {
+        Ok(crawl_ref)
+    }
+}
diff --git a/src/download.rs b/src/download.rs
index 9e6a5ee..e4903ca 100644
--- a/src/download.rs
+++ b/src/download.rs
@@ -1,8 +1,9 @@
 use flate2::read::GzDecoder;
 use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
-use reqwest::{header, Client, Url};
+use regex::Regex;
+use reqwest::{Client, Url, header};
 use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
-use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
+use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff};
 use std::{
     fs::File,
     io::{BufRead, BufReader},
@@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result
         .build())
 }
 
-pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    // Check if the snapshot is a news snapshot and reformat it
+    // The format of the main crawl urls is different from the news crawl urls
+    // https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
+    // https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz
+    let snapshot_original_ref = options.snapshot.clone();
+    if news_re.is_match(&options.snapshot) {
+        let caps = news_re.captures(&options.snapshot).unwrap();
+        options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
+    }
     let paths = format!(
         "{}crawl-data/{}/{}.paths.gz",
         BASE_URL, options.snapshot, options.data_type
@@ -89,6 +101,27 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download
         .and_then(|segments| segments.last()) // Retrieves the last segment
         .unwrap_or("file.download"); // Fallback to generic filename
 
+    let resp = client.head(url.as_str()).send().await?;
+    match resp.status() {
+        status if status.is_success() => (),
+        status if status.as_u16() == 404 => {
+            return Err(format!(
+                "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError code: {} {}",
+                snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("")
+            )
+            .into());
+        }
+        status => {
+            return Err(format!(
+                "Couldn't download URL: {}. Error code: {} {}",
+                url,
+                status.as_str(),
+                status.canonical_reason().unwrap_or("")
+            )
+            .into());
+        }
+    }
+
     let request = client.get(url.as_str());
 
     let mut dst = options.dst.to_path_buf();
@@ -134,7 +167,7 @@ async fn download_task(
         } else {
             // We return an Error if something goes wrong here
             return Err(
-                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
+                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
            );
         }
     };
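
For reference, below is a minimal standalone sketch (not part of the patch) of how the crawl-name validation added in src/cli.rs and the CC-NEWS path rewrite added in src/download.rs fit together. `build_paths_url` is a hypothetical helper written only for this illustration; it assumes BASE_URL is "https://data.commoncrawl.org/", which the example URLs in the src/download.rs comments suggest, and it inlines the rewrite that the patch actually performs inside download_paths.

use regex::Regex;

// Assumed base URL, taken from the comment URLs in src/download.rs above.
const BASE_URL: &str = "https://data.commoncrawl.org/";

// Hypothetical helper for illustration: validate a crawl reference and
// build the corresponding *.paths.gz URL.
fn build_paths_url(crawl: &str, data_type: &str) -> Result<String, String> {
    let main_re = Regex::new(r"^CC-MAIN-[0-9]{4}-[0-9]{2}$").unwrap();
    let news_re = Regex::new(r"^(CC-NEWS)-([0-9]{4})-([0-9]{2})$").unwrap();

    // Accept lowercase input, as the value_parser in src/cli.rs does.
    let crawl_ref = crawl.to_uppercase();

    let snapshot = if news_re.is_match(&crawl_ref) {
        // CC-NEWS data is laid out by year and month, so the reference
        // becomes path segments: CC-NEWS/YYYY/MM
        let caps = news_re.captures(&crawl_ref).unwrap();
        format!("{}/{}/{}", &caps[1], &caps[2], &caps[3])
    } else if main_re.is_match(&crawl_ref) {
        // CC-MAIN-YYYY-WW is used verbatim in the path
        crawl_ref
    } else {
        return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string());
    };

    Ok(format!("{}crawl-data/{}/{}.paths.gz", BASE_URL, snapshot, data_type))
}

fn main() {
    assert_eq!(
        build_paths_url("cc-news-2025-01", "warc").unwrap(),
        "https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz"
    );
    assert_eq!(
        build_paths_url("CC-MAIN-2025-08", "warc").unwrap(),
        "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz"
    );
}

The split exists because main crawls are addressed by year and ISO week in a single directory name (CC-MAIN-2025-08), while news crawls are addressed by year and month as nested directories (CC-NEWS/2025/01); the patch keeps validation in the clap value_parser and performs the rewrite just before the paths URL is built.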