From 0e388440e29907b7ee9802bd8cfaa3b70056b17d Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Thu, 6 Mar 2025 13:53:43 +0100
Subject: [PATCH 1/7] fix: produce an error when the requested crawl does not
 exist

Previously, when a user requested a crawl that doesn't exist,
cc-downloader downloaded the body of the error response anyway. Now this
action produces an error instead.
---
 Cargo.toml      | 12 ++++++------
 src/download.rs | 19 ++++++++++++++++++-
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index bf55e55..929a4ef 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,33 +1,33 @@
 [package]
 name = "cc-downloader"
 version = "0.5.2"
-edition = "2021"
+edition = "2024"
 authors = ["Pedro Ortiz Suarez"]
 description = "A polite and user-friendly downloader for Common Crawl data."
 license = "MIT OR Apache-2.0"
-rust-version = "1.83"
+rust-version = "1.85"
 readme = "README.md"
 homepage = "https://commoncrawl.org"
 repository = "https://github.com/commoncrawl/cc-downloader"
 documentation = "https://docs.rs/cc-downloader"
 
 [dependencies]
-clap = { version = "4.5.29", features = ["derive"] }
-flate2 = "1.0.35"
+clap = { version = "4.5.31", features = ["derive"] }
+flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
 reqwest = { version = "0.12.12", default-features = false, features = [
     "stream",
     "rustls-tls",
 ] }
-reqwest-middleware = "0.4.0"
+reqwest-middleware = "0.4.1"
 reqwest-retry = "0.7.0"
 tokio = { version = "1.43.0", features = ["full"] }
 tokio-util = { version = "0.7.13", features = ["compat"] }
 url = "2.5.4"
 
 [dev-dependencies]
-serde = { version = "1.0.217", features = ["derive"] }
+serde = { version = "1.0.218", features = ["derive"] }
 reqwest = { version = "0.12.12", default-features = false, features = [
     "stream",
     "rustls-tls",
diff --git a/src/download.rs b/src/download.rs
index 9e6a5ee..09f0d5f 100644
--- a/src/download.rs
+++ b/src/download.rs
@@ -89,6 +89,23 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
         .and_then(|segments| segments.last()) // Retrieves the last segment
         .unwrap_or("file.download"); // Fallback to generic filename
 
+    let resp = client.head(url.as_str()).send().await?;
+    match resp.status() {
+        status if status.is_success() => (),
+        status if status.is_client_error() => {
+            return Err(format!(
+                "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET:{}\n\tULR: {}\n\nDoesn't seem to exist or it is not accessible.\n\tError Code: {:?}",
+                options.snapshot, options.data_type, url, status
+            )
+            .into());
+        }
+        _ => {
+            return Err(
+                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
+            )
+        }
+    }
+
     let request = client.get(url.as_str());
 
     let mut dst = options.dst.to_path_buf();
@@ -134,7 +151,7 @@ async fn download_task(
     } else {
         // We return an Error if something goes wrong here
         return Err(
-            format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
+            format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
         );
     }
 };
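For context, the pattern this patch introduces — send a HEAD request first and map non-success statuses to errors before streaming any body — can be sketched independently of cc-downloader's internals. A minimal, hypothetical example using plain reqwest and tokio (the helper name and URL are illustrative, not taken from the patch):

    use reqwest::Client;

    /// Illustrative helper mirroring the idea above: verify that a URL
    /// exists before attempting to download it.
    async fn check_exists(client: &Client, url: &str) -> Result<(), String> {
        // HEAD fetches only the status line and headers, never the body,
        // so a missing crawl is detected without downloading an error page.
        let resp = client.head(url).send().await.map_err(|e| e.to_string())?;
        match resp.status() {
            status if status.is_success() => Ok(()),
            status if status.is_client_error() => {
                Err(format!("{url} doesn't seem to exist: {status}"))
            }
            status => Err(format!("couldn't reach {url}: {status}")),
        }
    }

    #[tokio::main]
    async fn main() {
        let client = Client::new();
        let url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz";
        match check_exists(&client, url).await {
            Ok(()) => println!("exists, safe to download"),
            Err(e) => eprintln!("{e}"),
        }
    }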
Error: {:?}", url, resp.status()).into(), ); } }; From aed4ae6472b7e06b8cdb5703b373e5ad48e93d3f Mon Sep 17 00:00:00 2001 From: Pedro Ortiz Suarez Date: Thu, 6 Mar 2025 15:09:02 +0100 Subject: [PATCH 2/7] feat: Adds explicit support for CC-NEWS, adds more details and fixes the 4XX error message when downloading paths, adds validation to the cli input for the crawl reference --- Cargo.toml | 1 + src/cli.rs | 18 ++++++++++++++++-- src/download.rs | 24 ++++++++++++++++++------ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 929a4ef..808f33b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ clap = { version = "4.5.31", features = ["derive"] } flate2 = "1.1.0" futures = "0.3.31" indicatif = "0.17.11" +regex = "1.11.1" reqwest = { version = "0.12.12", default-features = false, features = [ "stream", "rustls-tls", diff --git a/src/cli.rs b/src/cli.rs index 20b8144..c70c274 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,6 +1,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand, ValueEnum}; +use regex::Regex; #[derive(Parser)] #[command(version, about, long_about = None)] @@ -13,8 +14,8 @@ pub struct Cli { pub enum Commands { /// Download paths for a given crawl DownloadPaths { - /// Crawl reference, e.g. CC-MAIN-2021-04 - #[arg(value_name = "CRAWL")] + /// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01 + #[arg(value_name = "CRAWL", value_parser = crawl_name_format)] snapshot: String, /// Data type @@ -89,3 +90,16 @@ impl DataType { } } } + +//https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz + +fn crawl_name_format(crawl: &str) -> Result { + let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap(); + let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); + + if !(main_re.is_match(crawl) || news_re.is_match(crawl)) { + return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format, make sure your input is propely capitalized".to_string()); + } else { + return Ok(crawl.to_owned()); + } +} diff --git a/src/download.rs b/src/download.rs index 09f0d5f..50eb13f 100644 --- a/src/download.rs +++ b/src/download.rs @@ -1,8 +1,9 @@ use flate2::read::GzDecoder; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; -use reqwest::{header, Client, Url}; +use regex::Regex; +use reqwest::{Client, Url, header}; use reqwest_middleware::{ClientBuilder, ClientWithMiddleware}; -use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware}; +use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff}; use std::{ fs::File, io::{BufRead, BufReader}, @@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result .build()) } -pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> { +pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> { + let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); + + // Check if the snapshot is a news snapshot and reformat it + // The format of the main crawl urls is different from the news crawl urls + // https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz + // https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz + let snapshot_original_ref = options.snapshot.clone(); + if news_re.is_match(&options.snapshot) { + let caps = news_re.captures(&options.snapshot).unwrap(); + options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]); + } let paths = format!( 
"{}crawl-data/{}/{}.paths.gz", BASE_URL, options.snapshot, options.data_type @@ -94,15 +106,15 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download status if status.is_success() => (), status if status.is_client_error() => { return Err(format!( - "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET:{}\n\tULR: {}\n\nDoesn't seem to exist or it is not accessible.\n\tError Code: {:?}", - options.snapshot, options.data_type, url, status + "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError Code: {} {}", + snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("") ) .into()); } _ => { return Err( format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(), - ) + ); } } From 05ce83a58203f3525a0c4fdc088cc6c5384967ab Mon Sep 17 00:00:00 2001 From: Pedro Ortiz Suarez Date: Thu, 6 Mar 2025 15:29:35 +0100 Subject: [PATCH 3/7] docs: removed unused comment from code --- src/cli.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index c70c274..19a795f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -91,8 +91,6 @@ impl DataType { } } -//https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz - fn crawl_name_format(crawl: &str) -> Result { let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap(); let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); From b7ed9e96169a460a6237495c49c7c4d780c4315e Mon Sep 17 00:00:00 2001 From: Pedro Ortiz Suarez Date: Thu, 6 Mar 2025 16:54:35 +0100 Subject: [PATCH 4/7] fix: Added custom error message only for the 404 case and a method to automatically fix the casing of the crawl reference --- src/cli.rs | 8 +++++--- src/download.rs | 16 ++++++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 19a795f..b8cbf5b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -95,9 +95,11 @@ fn crawl_name_format(crawl: &str) -> Result { let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap(); let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); - if !(main_re.is_match(crawl) || news_re.is_match(crawl)) { - return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format, make sure your input is propely capitalized".to_string()); + let crawl_ref = crawl.to_uppercase(); + + if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) { + return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string()); } else { - return Ok(crawl.to_owned()); + return Ok(crawl_ref); } } diff --git a/src/download.rs b/src/download.rs index 50eb13f..e4903ca 100644 --- a/src/download.rs +++ b/src/download.rs @@ -104,17 +104,21 @@ pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), Down let resp = client.head(url.as_str()).send().await?; match resp.status() { status if status.is_success() => (), - status if status.is_client_error() => { + status if status.as_u16() == 404 => { return Err(format!( - "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError Code: {} {}", + "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError code: {} {}", snapshot_original_ref, options.data_type, url, 
From 83b5445647810037b06bbf65a3821ef3ce8b7af1 Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Thu, 6 Mar 2025 17:08:55 +0100
Subject: [PATCH 5/7] fix: fix two linter warnings that were not caught in the
 last commit
---
 src/cli.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index b8cbf5b..38d5dad 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -98,8 +98,8 @@ fn crawl_name_format(crawl: &str) -> Result<String, String> {
     let crawl_ref = crawl.to_uppercase();
 
     if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
-        return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string());
+        Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
     } else {
-        return Ok(crawl_ref);
+        Ok(crawl_ref)
     }
 }
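For context on this lint fix: clippy's needless_return fires when a return keyword appears in a block's tail position; because if/else is an expression in Rust, both branches can evaluate directly to the function's Result. A minimal illustration with a hypothetical validator:

    /// Illustrative: the if/else expression itself is the return value,
    /// so neither branch needs the `return` keyword.
    fn validate(input: &str) -> Result<String, String> {
        if input.is_empty() {
            Err("input must not be empty".to_string())
        } else {
            Ok(input.to_uppercase())
        }
    }

    fn main() {
        assert_eq!(validate("cc-main"), Ok("CC-MAIN".to_string()));
        assert!(validate("").is_err());
    }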
From 19d17b4474471ada3bc54939336cb0266f4ad020 Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Fri, 7 Mar 2025 09:54:10 +0100
Subject: [PATCH 6/7] chore: Bump the version number in the Cargo.toml and
 SECURITY.md files and update the README.md to prepare the next release
---
 Cargo.toml  | 2 +-
 README.md   | 2 +-
 SECURITY.md | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 808f33b..63ecdda 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cc-downloader"
-version = "0.5.2"
+version = "0.6.0"
 edition = "2024"
 authors = ["Pedro Ortiz Suarez"]
 description = "A polite and user-friendly downloader for Common Crawl data."
diff --git a/README.md b/README.md
index 6b657d2..7d653d5 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ Download paths for a given crawl
 Usage: cc-downloader download-paths <CRAWL> <SUBSET> <DESTINATION>
 
 Arguments:
-  <CRAWL>        Crawl reference, e.g. CC-MAIN-2021-04
+  <CRAWL>        Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
   <SUBSET>       Data type [possible values: segment, warc, wat, wet, robotstxt, non200responses, cc-index, cc-index-table]
   <DESTINATION>  Destination folder
diff --git a/SECURITY.md b/SECURITY.md
index 6480dce..29c9414 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -6,8 +6,8 @@ Only the latest minor version is being supported
 
 | Version | Supported          |
 | ------- | ------------------ |
-| 0.5.x   | :white_check_mark: |
-| < 0.5.0 | :x:                |
+| 0.6.x   | :white_check_mark: |
+| < 0.6.0 | :x:                |
 
 ## Reporting a Vulnerability

From b19e4bf9f79f295daaf4284d83ccd55ffab66abd Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Thu, 13 Mar 2025 23:32:18 +0100
Subject: [PATCH 7/7] fix: bump versions of some dependencies, and resolve the
 problem with the deprecated reqwest API

TODO: We need to monitor the open PRs in reqwest-middleware and bump its
version here as soon as they are merged.
---
 Cargo.toml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 63ecdda..6739dc8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,24 +12,24 @@ repository = "https://github.com/commoncrawl/cc-downloader"
 documentation = "https://docs.rs/cc-downloader"
 
 [dependencies]
-clap = { version = "4.5.31", features = ["derive"] }
+clap = { version = "4.5.32", features = ["derive"] }
 flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
 regex = "1.11.1"
-reqwest = { version = "0.12.12", default-features = false, features = [
+reqwest = { version = "0.12.14", default-features = false, features = [
     "stream",
     "rustls-tls",
 ] }
 reqwest-middleware = "0.4.1"
 reqwest-retry = "0.7.0"
-tokio = { version = "1.43.0", features = ["full"] }
-tokio-util = { version = "0.7.13", features = ["compat"] }
+tokio = { version = "1.44.1", features = ["full"] }
+tokio-util = { version = "0.7.14", features = ["compat"] }
 url = "2.5.4"
 
 [dev-dependencies]
-serde = { version = "1.0.218", features = ["derive"] }
+serde = { version = "1.0.219", features = ["derive"] }
-reqwest = { version = "0.12.12", default-features = false, features = [
+reqwest = { version = "0.12.14", default-features = false, features = [
    "stream",
    "rustls-tls",
    "json",