From 0e388440e29907b7ee9802bd8cfaa3b70056b17d Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Thu, 6 Mar 2025 13:53:43 +0100
Subject: [PATCH 1/7] fix: produce an error when the requested crawl does not
 exist

Previously, when a user requested a crawl that doesn't exist,
cc-downloader downloaded the body of the error response anyway. Now this
action produces an error instead.
---
 Cargo.toml      | 12 ++++++------
 src/download.rs | 19 ++++++++++++++++++-
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index bf55e55..929a4ef 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,33 +1,33 @@
 [package]
 name = "cc-downloader"
 version = "0.5.2"
-edition = "2021"
+edition = "2024"
 authors = ["Pedro Ortiz Suarez"]
 description = "A polite and user-friendly downloader for Common Crawl data."
 license = "MIT OR Apache-2.0"
-rust-version = "1.83"
+rust-version = "1.85"
 readme = "README.md"
 homepage = "https://commoncrawl.org"
 repository = "https://github.com/commoncrawl/cc-downloader"
 documentation = "https://docs.rs/cc-downloader"
 
 [dependencies]
-clap = { version = "4.5.29", features = ["derive"] }
-flate2 = "1.0.35"
+clap = { version = "4.5.31", features = ["derive"] }
+flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
 reqwest = { version = "0.12.12", default-features = false, features = [
     "stream",
     "rustls-tls",
 ] }
-reqwest-middleware = "0.4.0"
+reqwest-middleware = "0.4.1"
 reqwest-retry = "0.7.0"
 tokio = { version = "1.43.0", features = ["full"] }
 tokio-util = { version = "0.7.13", features = ["compat"] }
 url = "2.5.4"
 
 [dev-dependencies]
-serde = { version = "1.0.217", features = ["derive"] }
+serde = { version = "1.0.218", features = ["derive"] }
 reqwest = { version = "0.12.12", default-features = false, features = [
     "stream",
     "rustls-tls",
diff --git a/src/download.rs b/src/download.rs
index 9e6a5ee..09f0d5f 100644
--- a/src/download.rs
+++ b/src/download.rs
@@ -89,6 +89,23 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
         .and_then(|segments| segments.last()) // Retrieves the last segment
         .unwrap_or("file.download"); // Fallback to generic filename
 
+    let resp = client.head(url.as_str()).send().await?;
+    match resp.status() {
+        status if status.is_success() => (),
+        status if status.is_client_error() => {
+            return Err(format!(
+                "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET:{}\n\tULR: {}\n\nDoesn't seem to exist or it is not accessible.\n\tError Code: {:?}",
+                options.snapshot, options.data_type, url, status
+            )
+            .into());
+        }
+        _ => {
+            return Err(
+                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
+            )
+        }
+    }
+
     let request = client.get(url.as_str());
 
     let mut dst = options.dst.to_path_buf();
@@ -134,7 +151,7 @@ async fn download_task(
     } else {
         // We return an Error if something goes wrong here
         return Err(
-            format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
+            format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
         );
     }
 };
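For context, the pattern this patch introduces — send a HEAD request first and map non-success statuses to errors before streaming any body — can be sketched independently of cc-downloader's internals. A minimal, hypothetical example using plain reqwest and tokio (the helper name and URL are illustrative, not taken from the patch):

    use reqwest::Client;

    /// Illustrative helper mirroring the idea above: verify that a URL
    /// exists before attempting to download it.
    async fn check_exists(client: &Client, url: &str) -> Result<(), String> {
        // HEAD fetches only the status line and headers, never the body,
        // so a missing crawl is detected without downloading an error page.
        let resp = client.head(url).send().await.map_err(|e| e.to_string())?;
        match resp.status() {
            status if status.is_success() => Ok(()),
            status if status.is_client_error() => {
                Err(format!("{url} doesn't seem to exist: {status}"))
            }
            status => Err(format!("couldn't reach {url}: {status}")),
        }
    }

    #[tokio::main]
    async fn main() {
        let client = Client::new();
        let url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz";
        match check_exists(&client, url).await {
            Ok(()) => println!("exists, safe to download"),
            Err(e) => eprintln!("{e}"),
        }
    }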
Error: {:?}", url, resp.status()).into(), ); } }; From aed4ae6472b7e06b8cdb5703b373e5ad48e93d3f Mon Sep 17 00:00:00 2001 From: Pedro Ortiz Suarez Date: Thu, 6 Mar 2025 15:09:02 +0100 Subject: [PATCH 2/7] feat: Adds explicit support for CC-NEWS, adds more details and fixes the 4XX error message when downloading paths, adds validation to the cli input for the crawl reference --- Cargo.toml | 1 + src/cli.rs | 18 ++++++++++++++++-- src/download.rs | 24 ++++++++++++++++++------ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 929a4ef..808f33b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ clap = { version = "4.5.31", features = ["derive"] } flate2 = "1.1.0" futures = "0.3.31" indicatif = "0.17.11" +regex = "1.11.1" reqwest = { version = "0.12.12", default-features = false, features = [ "stream", "rustls-tls", diff --git a/src/cli.rs b/src/cli.rs index 20b8144..c70c274 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,6 +1,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand, ValueEnum}; +use regex::Regex; #[derive(Parser)] #[command(version, about, long_about = None)] @@ -13,8 +14,8 @@ pub struct Cli { pub enum Commands { /// Download paths for a given crawl DownloadPaths { - /// Crawl reference, e.g. CC-MAIN-2021-04 - #[arg(value_name = "CRAWL")] + /// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01 + #[arg(value_name = "CRAWL", value_parser = crawl_name_format)] snapshot: String, /// Data type @@ -89,3 +90,16 @@ impl DataType { } } } + +//https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz + +fn crawl_name_format(crawl: &str) -> Result { + let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap(); + let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); + + if !(main_re.is_match(crawl) || news_re.is_match(crawl)) { + return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format, make sure your input is propely capitalized".to_string()); + } else { + return Ok(crawl.to_owned()); + } +} diff --git a/src/download.rs b/src/download.rs index 09f0d5f..50eb13f 100644 --- a/src/download.rs +++ b/src/download.rs @@ -1,8 +1,9 @@ use flate2::read::GzDecoder; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; -use reqwest::{header, Client, Url}; +use regex::Regex; +use reqwest::{Client, Url, header}; use reqwest_middleware::{ClientBuilder, ClientWithMiddleware}; -use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware}; +use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff}; use std::{ fs::File, io::{BufRead, BufReader}, @@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result .build()) } -pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> { +pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> { + let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); + + // Check if the snapshot is a news snapshot and reformat it + // The format of the main crawl urls is different from the news crawl urls + // https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz + // https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz + let snapshot_original_ref = options.snapshot.clone(); + if news_re.is_match(&options.snapshot) { + let caps = news_re.captures(&options.snapshot).unwrap(); + options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]); + } let paths = format!( 
"{}crawl-data/{}/{}.paths.gz", BASE_URL, options.snapshot, options.data_type @@ -94,15 +106,15 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download status if status.is_success() => (), status if status.is_client_error() => { return Err(format!( - "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET:{}\n\tULR: {}\n\nDoesn't seem to exist or it is not accessible.\n\tError Code: {:?}", - options.snapshot, options.data_type, url, status + "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError Code: {} {}", + snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("") ) .into()); } _ => { return Err( format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(), - ) + ); } } From 05ce83a58203f3525a0c4fdc088cc6c5384967ab Mon Sep 17 00:00:00 2001 From: Pedro Ortiz Suarez Date: Thu, 6 Mar 2025 15:29:35 +0100 Subject: [PATCH 3/7] docs: removed unused comment from code --- src/cli.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index c70c274..19a795f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -91,8 +91,6 @@ impl DataType { } } -//https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz - fn crawl_name_format(crawl: &str) -> Result { let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap(); let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); From b7ed9e96169a460a6237495c49c7c4d780c4315e Mon Sep 17 00:00:00 2001 From: Pedro Ortiz Suarez Date: Thu, 6 Mar 2025 16:54:35 +0100 Subject: [PATCH 4/7] fix: Added custom error message only for the 404 case and a method to automatically fix the casing of the crawl reference --- src/cli.rs | 8 +++++--- src/download.rs | 16 ++++++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 19a795f..b8cbf5b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -95,9 +95,11 @@ fn crawl_name_format(crawl: &str) -> Result { let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap(); let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap(); - if !(main_re.is_match(crawl) || news_re.is_match(crawl)) { - return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format, make sure your input is propely capitalized".to_string()); + let crawl_ref = crawl.to_uppercase(); + + if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) { + return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string()); } else { - return Ok(crawl.to_owned()); + return Ok(crawl_ref); } } diff --git a/src/download.rs b/src/download.rs index 50eb13f..e4903ca 100644 --- a/src/download.rs +++ b/src/download.rs @@ -104,17 +104,21 @@ pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), Down let resp = client.head(url.as_str()).send().await?; match resp.status() { status if status.is_success() => (), - status if status.is_client_error() => { + status if status.as_u16() == 404 => { return Err(format!( - "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError Code: {} {}", + "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError code: {} {}", snapshot_original_ref, options.data_type, url, 
From 83b5445647810037b06bbf65a3821ef3ce8b7af1 Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Thu, 6 Mar 2025 17:08:55 +0100
Subject: [PATCH 5/7] fix: fix two linter warnings that were not caught in the
 last commit
---
 src/cli.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index b8cbf5b..38d5dad 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -98,8 +98,8 @@ fn crawl_name_format(crawl: &str) -> Result<String, String> {
     let crawl_ref = crawl.to_uppercase();
 
     if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
-        return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string());
+        Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
     } else {
-        return Ok(crawl_ref);
+        Ok(crawl_ref)
     }
 }
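For context on this lint fix: clippy's needless_return fires when a return keyword appears in a block's tail position; because if/else is an expression in Rust, both branches can evaluate directly to the function's Result. A minimal illustration with a hypothetical validator:

    /// Illustrative: the if/else expression itself is the return value,
    /// so neither branch needs the `return` keyword.
    fn validate(input: &str) -> Result<String, String> {
        if input.is_empty() {
            Err("input must not be empty".to_string())
        } else {
            Ok(input.to_uppercase())
        }
    }

    fn main() {
        assert_eq!(validate("cc-main"), Ok("CC-MAIN".to_string()));
        assert!(validate("").is_err());
    }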
From 19d17b4474471ada3bc54939336cb0266f4ad020 Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Fri, 7 Mar 2025 09:54:10 +0100
Subject: [PATCH 6/7] chore: Bump the version number in the Cargo.toml and
 SECURITY.md files and update the README.md to prepare the next release
---
 Cargo.toml  | 2 +-
 README.md   | 2 +-
 SECURITY.md | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 808f33b..63ecdda 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cc-downloader"
-version = "0.5.2"
+version = "0.6.0"
 edition = "2024"
 authors = ["Pedro Ortiz Suarez"]
 description = "A polite and user-friendly downloader for Common Crawl data."
diff --git a/README.md b/README.md
index 6b657d2..7d653d5 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ Download paths for a given crawl
 Usage: cc-downloader download-paths <CRAWL> <SUBSET> <DESTINATION>
 
 Arguments:
-  <CRAWL>        Crawl reference, e.g. CC-MAIN-2021-04
+  <CRAWL>        Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
   <SUBSET>       Data type [possible values: segment, warc, wat, wet, robotstxt, non200responses, cc-index, cc-index-table]
   <DESTINATION>  Destination folder
diff --git a/SECURITY.md b/SECURITY.md
index 6480dce..29c9414 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -6,8 +6,8 @@ Only the latest minor version is being supported
 
 | Version | Supported          |
 | ------- | ------------------ |
-| 0.5.x   | :white_check_mark: |
-| < 0.5.0 | :x:                |
+| 0.6.x   | :white_check_mark: |
+| < 0.6.0 | :x:                |
 
 ## Reporting a Vulnerability

From b19e4bf9f79f295daaf4284d83ccd55ffab66abd Mon Sep 17 00:00:00 2001
From: Pedro Ortiz Suarez
Date: Thu, 13 Mar 2025 23:32:18 +0100
Subject: [PATCH 7/7] fix: bump versions of some dependencies, and resolve the
 problem with the deprecated reqwest API

TODO: We need to monitor the open PRs in reqwest-middleware and bump its
version here as soon as they are merged.
---
 Cargo.toml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 63ecdda..6739dc8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,24 +12,24 @@ repository = "https://github.com/commoncrawl/cc-downloader"
 documentation = "https://docs.rs/cc-downloader"
 
 [dependencies]
-clap = { version = "4.5.31", features = ["derive"] }
+clap = { version = "4.5.32", features = ["derive"] }
 flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
 regex = "1.11.1"
-reqwest = { version = "0.12.12", default-features = false, features = [
+reqwest = { version = "0.12.14", default-features = false, features = [
     "stream",
     "rustls-tls",
 ] }
 reqwest-middleware = "0.4.1"
 reqwest-retry = "0.7.0"
-tokio = { version = "1.43.0", features = ["full"] }
-tokio-util = { version = "0.7.13", features = ["compat"] }
+tokio = { version = "1.44.1", features = ["full"] }
+tokio-util = { version = "0.7.14", features = ["compat"] }
 url = "2.5.4"
 
 [dev-dependencies]
-serde = { version = "1.0.218", features = ["derive"] }
+serde = { version = "1.0.219", features = ["derive"] }
-reqwest = { version = "0.12.12", default-features = false, features = [
+reqwest = { version = "0.12.14", default-features = false, features = [
    "stream",
    "rustls-tls",
    "json",