Skip to content

Commit aed4ae6

Browse files
committed
feat: Adds explicit support for CC-NEWS, adds more details and fixes the 4XX error message when downloading paths, adds validation to the cli input for the crawl reference
1 parent 0e38844 commit aed4ae6

File tree

3 files changed

+35
-8
lines changed

3 files changed

+35
-8
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ clap = { version = "4.5.31", features = ["derive"] }
1616
flate2 = "1.1.0"
1717
futures = "0.3.31"
1818
indicatif = "0.17.11"
19+
regex = "1.11.1"
1920
reqwest = { version = "0.12.12", default-features = false, features = [
2021
"stream",
2122
"rustls-tls",

src/cli.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use std::path::PathBuf;
22

33
use clap::{Parser, Subcommand, ValueEnum};
4+
use regex::Regex;
45

56
#[derive(Parser)]
67
#[command(version, about, long_about = None)]
@@ -13,8 +14,8 @@ pub struct Cli {
1314
pub enum Commands {
1415
/// Download paths for a given crawl
1516
DownloadPaths {
16-
/// Crawl reference, e.g. CC-MAIN-2021-04
17-
#[arg(value_name = "CRAWL")]
17+
/// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
18+
#[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
1819
snapshot: String,
1920

2021
/// Data type
@@ -89,3 +90,16 @@ impl DataType {
8990
}
9091
}
9192
}
93+
94+
// Example CC-NEWS path listing URL: https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
95+
96+
fn crawl_name_format(crawl: &str) -> Result<String, String> {
97+
let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
98+
let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
99+
100+
if !(main_re.is_match(crawl) || news_re.is_match(crawl)) {
101+
return Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format, make sure your input is propely capitalized".to_string());
102+
} else {
103+
return Ok(crawl.to_owned());
104+
}
105+
}

src/download.rs

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use flate2::read::GzDecoder;
22
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
3-
use reqwest::{header, Client, Url};
3+
use regex::Regex;
4+
use reqwest::{Client, Url, header};
45
use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
5-
use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
6+
use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff};
67
use std::{
78
fs::File,
89
io::{BufRead, BufReader},
@@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError>
7475
.build())
7576
}
7677

77-
pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
78+
pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> {
79+
let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
80+
81+
// Check if the snapshot is a news snapshot and reformat it
82+
// The format of the main crawl urls is different from the news crawl urls
83+
// https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
84+
// https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz
85+
let snapshot_original_ref = options.snapshot.clone();
86+
if news_re.is_match(&options.snapshot) {
87+
let caps = news_re.captures(&options.snapshot).unwrap();
88+
options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
89+
}
7890
let paths = format!(
7991
"{}crawl-data/{}/{}.paths.gz",
8092
BASE_URL, options.snapshot, options.data_type
@@ -94,15 +106,15 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download
94106
status if status.is_success() => (),
95107
status if status.is_client_error() => {
96108
return Err(format!(
97-
"\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET:{}\n\tULR: {}\n\nDoesn't seem to exist or it is not accessible.\n\tError Code: {:?}",
98-
options.snapshot, options.data_type, url, status
109+
"\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError Code: {} {}",
110+
snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("")
99111
)
100112
.into());
101113
}
102114
_ => {
103115
return Err(
104116
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
105-
)
117+
);
106118
}
107119
}
108120

0 commit comments

Comments
 (0)