feat: Adds explicit support for CC-NEWS, adds more details and fixes the 4XX error message when downloading paths, adds validation to the cli input for the crawl reference

pjox · pjox · commit aed4ae6472b7 · 2025-03-06T15:09:02.000+01:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -16,6 +16,7 @@ clap = { version = "4.5.31", features = ["derive"] }
 flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
+regex = "1.11.1"
 reqwest = { version = "0.12.12", default-features = false, features = [
     "stream",
     "rustls-tls",
diff --git a/src/cli.rs b/src/cli.rs
@@ -1,6 +1,7 @@
 use std::path::PathBuf;
 
 use clap::{Parser, Subcommand, ValueEnum};
+use regex::Regex;
 
 #[derive(Parser)]
 #[command(version, about, long_about = None)]
@@ -13,8 +14,8 @@ pub struct Cli {
 pub enum Commands {
     /// Download paths for a given crawl
     DownloadPaths {
-        /// Crawl reference, e.g. CC-MAIN-2021-04
-        #[arg(value_name = "CRAWL")]
+        /// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
+        #[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
         snapshot: String,
 
         /// Data type
@@ -89,3 +90,16 @@ impl DataType {
         }
     }
 }
+
+//https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
+
+fn crawl_name_format(crawl: &str) -> Result<String, String> {
+    let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    if !(main_re.is_match(crawl) || news_re.is_match(crawl)) {
+        return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format, make sure your input is propely capitalized".to_string());
+    } else {
+        return Ok(crawl.to_owned());
+    }
+}
diff --git a/src/download.rs b/src/download.rs
@@ -1,8 +1,9 @@
 use flate2::read::GzDecoder;
 use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
-use reqwest::{header, Client, Url};
+use regex::Regex;
+use reqwest::{Client, Url, header};
 use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
-use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
+use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff};
 use std::{
     fs::File,
     io::{BufRead, BufReader},
@@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError>
         .build())
 }
 
-pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    // Check if the snapshot is a news snapshot and reformat it
+    // The format of the main crawl urls is different from the news crawl urls
+    // https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
+    // https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz
+    let snapshot_original_ref = options.snapshot.clone();
+    if news_re.is_match(&options.snapshot) {
+        let caps = news_re.captures(&options.snapshot).unwrap();
+        options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
+    }
     let paths = format!(
         "{}crawl-data/{}/{}.paths.gz",
         BASE_URL, options.snapshot, options.data_type
@@ -94,15 +106,15 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download
         status if status.is_success() => (),
         status if status.is_client_error() => {
             return Err(format!(
-                "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET:{}\n\tULR: {}\n\nDoesn't seem to exist or it is not accessible.\n\tError Code: {:?}",
-                options.snapshot, options.data_type, url, status
+                "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError Code: {} {}",
+                snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("")
             )
             .into());
         }
         _ => {
             return Err(
                 format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
-            )
+            );
         }
     }