Skip to content

Commit 5fb6ff4

Browse files
authored
Merge pull request #12 from commoncrawl/dev
Add support for CC-NEWS and validation of the crawl reference in the CLI interface
2 parents e2cd0e0 + b19e4bf commit 5fb6ff4

File tree

5 files changed

+68
-20
lines changed

5 files changed

+68
-20
lines changed

Cargo.toml

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,35 @@
11
[package]
22
name = "cc-downloader"
3-
version = "0.5.2"
4-
edition = "2021"
3+
version = "0.6.0"
4+
edition = "2024"
55
authors = ["Pedro Ortiz Suarez <pedro@commoncrawl.org>"]
66
description = "A polite and user-friendly downloader for Common Crawl data."
77
license = "MIT OR Apache-2.0"
8-
rust-version = "1.83"
8+
rust-version = "1.85"
99
readme = "README.md"
1010
homepage = "https://commoncrawl.org"
1111
repository = "https://github.com/commoncrawl/cc-downloader"
1212
documentation = "https://docs.rs/cc-downloader"
1313

1414
[dependencies]
15-
clap = { version = "4.5.29", features = ["derive"] }
16-
flate2 = "1.0.35"
15+
clap = { version = "4.5.32", features = ["derive"] }
16+
flate2 = "1.1.0"
1717
futures = "0.3.31"
1818
indicatif = "0.17.11"
19-
reqwest = { version = "0.12.12", default-features = false, features = [
19+
regex = "1.11.1"
20+
reqwest = { version = "0.12.14", default-features = false, features = [
2021
"stream",
2122
"rustls-tls",
2223
] }
23-
reqwest-middleware = "0.4.0"
24+
reqwest-middleware = "0.4.1"
2425
reqwest-retry = "0.7.0"
25-
tokio = { version = "1.43.0", features = ["full"] }
26-
tokio-util = { version = "0.7.13", features = ["compat"] }
26+
tokio = { version = "1.44.1", features = ["full"] }
27+
tokio-util = { version = "0.7.14", features = ["compat"] }
2728
url = "2.5.4"
2829

2930
[dev-dependencies]
30-
serde = { version = "1.0.217", features = ["derive"] }
31-
reqwest = { version = "0.12.12", default-features = false, features = [
31+
serde = { version = "1.0.219", features = ["derive"] }
32+
reqwest = { version = "0.12.14", default-features = false, features = [
3233
"stream",
3334
"rustls-tls",
3435
"json",

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ Download paths for a given crawl
4343
Usage: cc-downloader download-paths <CRAWL> <SUBSET> <DESTINATION>
4444
4545
Arguments:
46-
<CRAWL> Crawl reference, e.g. CC-MAIN-2021-04
46+
<CRAWL> Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
4747
<SUBSET> Data type [possible values: segment, warc, wat, wet, robotstxt, non200responses, cc-index, cc-index-table]
4848
<DESTINATION> Destination folder
4949

SECURITY.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ Only the latest minor version is being supported
66

77
| Version | Supported |
88
| ------- | ------------------ |
9-
| 0.5.x | :white_check_mark: |
10-
| < 0.5.0 | :x: |
9+
| 0.6.x | :white_check_mark: |
10+
| < 0.6.0 | :x: |
1111

1212
## Reporting a Vulnerability
1313

src/cli.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use std::path::PathBuf;
22

33
use clap::{Parser, Subcommand, ValueEnum};
4+
use regex::Regex;
45

56
#[derive(Parser)]
67
#[command(version, about, long_about = None)]
@@ -13,8 +14,8 @@ pub struct Cli {
1314
pub enum Commands {
1415
/// Download paths for a given crawl
1516
DownloadPaths {
16-
/// Crawl reference, e.g. CC-MAIN-2021-04
17-
#[arg(value_name = "CRAWL")]
17+
/// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
18+
#[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
1819
snapshot: String,
1920

2021
/// Data type
@@ -89,3 +90,16 @@ impl DataType {
8990
}
9091
}
9192
}
93+
94+
fn crawl_name_format(crawl: &str) -> Result<String, String> {
95+
let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
96+
let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
97+
98+
let crawl_ref = crawl.to_uppercase();
99+
100+
if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
101+
Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
102+
} else {
103+
Ok(crawl_ref)
104+
}
105+
}

src/download.rs

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use flate2::read::GzDecoder;
22
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
3-
use reqwest::{header, Client, Url};
3+
use regex::Regex;
4+
use reqwest::{Client, Url, header};
45
use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
5-
use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
6+
use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff};
67
use std::{
78
fs::File,
89
io::{BufRead, BufReader},
@@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError>
7475
.build())
7576
}
7677

77-
pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
78+
pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> {
79+
let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
80+
81+
// Check if the snapshot is a news snapshot and reformat it
82+
// The format of the main crawl urls is different from the news crawl urls
83+
// https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
84+
// https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz
85+
let snapshot_original_ref = options.snapshot.clone();
86+
if news_re.is_match(&options.snapshot) {
87+
let caps = news_re.captures(&options.snapshot).unwrap();
88+
options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
89+
}
7890
let paths = format!(
7991
"{}crawl-data/{}/{}.paths.gz",
8092
BASE_URL, options.snapshot, options.data_type
@@ -89,6 +101,27 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download
89101
.and_then(|segments| segments.last()) // Retrieves the last segment
90102
.unwrap_or("file.download"); // Fallback to generic filename
91103

104+
let resp = client.head(url.as_str()).send().await?;
105+
match resp.status() {
106+
status if status.is_success() => (),
107+
status if status.as_u16() == 404 => {
108+
return Err(format!(
109+
"\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError code: {} {}",
110+
snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("")
111+
)
112+
.into());
113+
}
114+
status => {
115+
return Err(format!(
116+
"Couldn't download URL: {}. Error code: {} {}",
117+
url,
118+
status.as_str(),
119+
status.canonical_reason().unwrap_or("")
120+
)
121+
.into());
122+
}
123+
}
124+
92125
let request = client.get(url.as_str());
93126

94127
let mut dst = options.dst.to_path_buf();
@@ -134,7 +167,7 @@ async fn download_task(
134167
} else {
135168
// We return an Error if something goes wrong here
136169
return Err(
137-
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
170+
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
138171
);
139172
}
140173
};

0 commit comments

Comments
 (0)