Skip to content

Commit 5fb6ff4

Browse files
authored
Merge pull request #12 from commoncrawl/dev
Add support for CC-NEWS and validation of the crawl reference in the CLI interface
2 parents e2cd0e0 + b19e4bf commit 5fb6ff4

File tree

5 files changed

+68
-20
lines changed

5 files changed

+68
-20
lines changed

Cargo.toml

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,35 @@
11
[package]
22
name = "cc-downloader"
3-
version = "0.5.2"
4-
edition = "2021"
3+
version = "0.6.0"
4+
edition = "2024"
55
authors = ["Pedro Ortiz Suarez <pedro@commoncrawl.org>"]
66
description = "A polite and user-friendly downloader for Common Crawl data."
77
license = "MIT OR Apache-2.0"
8-
rust-version = "1.83"
8+
rust-version = "1.85"
99
readme = "README.md"
1010
homepage = "https://commoncrawl.org"
1111
repository = "https://github.com/commoncrawl/cc-downloader"
1212
documentation = "https://docs.rs/cc-downloader"
1313

1414
[dependencies]
15-
clap = { version = "4.5.29", features = ["derive"] }
16-
flate2 = "1.0.35"
15+
clap = { version = "4.5.32", features = ["derive"] }
16+
flate2 = "1.1.0"
1717
futures = "0.3.31"
1818
indicatif = "0.17.11"
19-
reqwest = { version = "0.12.12", default-features = false, features = [
19+
regex = "1.11.1"
20+
reqwest = { version = "0.12.14", default-features = false, features = [
2021
"stream",
2122
"rustls-tls",
2223
] }
23-
reqwest-middleware = "0.4.0"
24+
reqwest-middleware = "0.4.1"
2425
reqwest-retry = "0.7.0"
25-
tokio = { version = "1.43.0", features = ["full"] }
26-
tokio-util = { version = "0.7.13", features = ["compat"] }
26+
tokio = { version = "1.44.1", features = ["full"] }
27+
tokio-util = { version = "0.7.14", features = ["compat"] }
2728
url = "2.5.4"
2829

2930
[dev-dependencies]
30-
serde = { version = "1.0.217", features = ["derive"] }
31-
reqwest = { version = "0.12.12", default-features = false, features = [
31+
serde = { version = "1.0.219", features = ["derive"] }
32+
reqwest = { version = "0.12.14", default-features = false, features = [
3233
"stream",
3334
"rustls-tls",
3435
"json",

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ Download paths for a given crawl
4343
Usage: cc-downloader download-paths <CRAWL> <SUBSET> <DESTINATION>
4444
4545
Arguments:
46-
<CRAWL> Crawl reference, e.g. CC-MAIN-2021-04
46+
<CRAWL> Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
4747
<SUBSET> Data type [possible values: segment, warc, wat, wet, robotstxt, non200responses, cc-index, cc-index-table]
4848
<DESTINATION> Destination folder
4949

SECURITY.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ Only the latest minor version is being supported
66

77
| Version | Supported |
88
| ------- | ------------------ |
9-
| 0.5.x | :white_check_mark: |
10-
| < 0.5.0 | :x: |
9+
| 0.6.x | :white_check_mark: |
10+
| < 0.6.0 | :x: |
1111

1212
## Reporting a Vulnerability
1313

src/cli.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use std::path::PathBuf;
22

33
use clap::{Parser, Subcommand, ValueEnum};
4+
use regex::Regex;
45

56
#[derive(Parser)]
67
#[command(version, about, long_about = None)]
@@ -13,8 +14,8 @@ pub struct Cli {
1314
pub enum Commands {
1415
/// Download paths for a given crawl
1516
DownloadPaths {
16-
/// Crawl reference, e.g. CC-MAIN-2021-04
17-
#[arg(value_name = "CRAWL")]
17+
/// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
18+
#[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
1819
snapshot: String,
1920

2021
/// Data type
@@ -89,3 +90,16 @@ impl DataType {
8990
}
9091
}
9192
}
93+
94+
fn crawl_name_format(crawl: &str) -> Result<String, String> {
95+
let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
96+
let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
97+
98+
let crawl_ref = crawl.to_uppercase();
99+
100+
if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
101+
Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
102+
} else {
103+
Ok(crawl_ref)
104+
}
105+
}

src/download.rs

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use flate2::read::GzDecoder;
22
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
3-
use reqwest::{header, Client, Url};
3+
use regex::Regex;
4+
use reqwest::{Client, Url, header};
45
use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
5-
use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
6+
use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff};
67
use std::{
78
fs::File,
89
io::{BufRead, BufReader},
@@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError>
7475
.build())
7576
}
7677

77-
pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
78+
pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> {
79+
let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
80+
81+
// Check if the snapshot is a news snapshot and reformat it
82+
// The format of the main crawl urls is different from the news crawl urls
83+
// https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
84+
// https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz
85+
let snapshot_original_ref = options.snapshot.clone();
86+
if news_re.is_match(&options.snapshot) {
87+
let caps = news_re.captures(&options.snapshot).unwrap();
88+
options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
89+
}
7890
let paths = format!(
7991
"{}crawl-data/{}/{}.paths.gz",
8092
BASE_URL, options.snapshot, options.data_type
@@ -89,6 +101,27 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download
89101
.and_then(|segments| segments.last()) // Retrieves the last segment
90102
.unwrap_or("file.download"); // Fallback to generic filename
91103

104+
let resp = client.head(url.as_str()).send().await?;
105+
match resp.status() {
106+
status if status.is_success() => (),
107+
status if status.as_u16() == 404 => {
108+
return Err(format!(
109+
"\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError code: {} {}",
110+
snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("")
111+
)
112+
.into());
113+
}
114+
status => {
115+
return Err(format!(
116+
"Couldn't download URL: {}. Error code: {} {}",
117+
url,
118+
status.as_str(),
119+
status.canonical_reason().unwrap_or("")
120+
)
121+
.into());
122+
}
123+
}
124+
92125
let request = client.get(url.as_str());
93126

94127
let mut dst = options.dst.to_path_buf();
@@ -134,7 +167,7 @@ async fn download_task(
134167
} else {
135168
// We return an Error if something goes wrong here
136169
return Err(
137-
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
170+
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
138171
);
139172
}
140173
};

0 commit comments

Comments
 (0)