CC-NEWS support and validation for crawl reference #11

Merged · 5 commits · Mar 7, 2025

13 changes: 7 additions & 6 deletions Cargo.toml
@@ -1,33 +1,34 @@
 [package]
 name = "cc-downloader"
 version = "0.5.2"
-edition = "2021"
+edition = "2024"
 authors = ["Pedro Ortiz Suarez <pedro@commoncrawl.org>"]
 description = "A polite and user-friendly downloader for Common Crawl data."
 license = "MIT OR Apache-2.0"
-rust-version = "1.83"
+rust-version = "1.85"
 readme = "README.md"
 homepage = "https://commoncrawl.org"
 repository = "https://github.com/commoncrawl/cc-downloader"
 documentation = "https://docs.rs/cc-downloader"
 
 [dependencies]
-clap = { version = "4.5.29", features = ["derive"] }
-flate2 = "1.0.35"
+clap = { version = "4.5.31", features = ["derive"] }
+flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
+regex = "1.11.1"
 reqwest = { version = "0.12.12", default-features = false, features = [
     "stream",
     "rustls-tls",
 ] }
-reqwest-middleware = "0.4.0"
+reqwest-middleware = "0.4.1"
 reqwest-retry = "0.7.0"
 tokio = { version = "1.43.0", features = ["full"] }
 tokio-util = { version = "0.7.13", features = ["compat"] }
 url = "2.5.4"
 
 [dev-dependencies]
-serde = { version = "1.0.217", features = ["derive"] }
+serde = { version = "1.0.218", features = ["derive"] }
 reqwest = { version = "0.12.12", default-features = false, features = [
     "stream",
     "rustls-tls",
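The edition bump to 2024 is what forces rust-version up to 1.85, since the 2024 edition is first supported by that toolchain. The regex crate enters the dependency list here because src/cli.rs and src/download.rs below both start using it to validate and rewrite crawl references.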
18 changes: 16 additions & 2 deletions src/cli.rs
@@ -1,6 +1,7 @@
 use std::path::PathBuf;
 
 use clap::{Parser, Subcommand, ValueEnum};
+use regex::Regex;
 
 #[derive(Parser)]
 #[command(version, about, long_about = None)]
@@ -13,8 +14,8 @@ pub struct Cli {
 pub enum Commands {
     /// Download paths for a given crawl
     DownloadPaths {
-        /// Crawl reference, e.g. CC-MAIN-2021-04
-        #[arg(value_name = "CRAWL")]
+        /// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
+        #[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
         snapshot: String,
 
         /// Data type
@@ -89,3 +90,16 @@ impl DataType {
         }
     }
 }
+
+fn crawl_name_format(crawl: &str) -> Result<String, String> {
+    let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    let crawl_ref = crawl.to_uppercase();
+
+    if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
+        Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
+    } else {
+        Ok(crawl_ref)
+    }
+}
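Because crawl_name_format is wired in as a clap value_parser, it runs at argument-parsing time: input is upper-cased and only CC-MAIN-YYYY-WW or CC-NEWS-YYYY-MM references get through. A minimal sketch of the expected behavior, written as a hypothetical test that is not part of this PR:

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn crawl_name_format_validates_and_normalizes() {
            // Valid references are accepted and upper-cased.
            assert_eq!(
                crawl_name_format("cc-main-2021-04").unwrap(),
                "CC-MAIN-2021-04"
            );
            assert_eq!(
                crawl_name_format("CC-NEWS-2025-01").unwrap(),
                "CC-NEWS-2025-01"
            );
            // Malformed references are rejected with a format hint.
            assert!(crawl_name_format("CC-MAIN-21-04").is_err());
            assert!(crawl_name_format("news-2025").is_err());
        }
    }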
41 changes: 37 additions & 4 deletions src/download.rs
@@ -1,8 +1,9 @@
 use flate2::read::GzDecoder;
 use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
-use reqwest::{header, Client, Url};
+use regex::Regex;
+use reqwest::{Client, Url, header};
 use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
-use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
+use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff};
 use std::{
     fs::File,
     io::{BufRead, BufReader},
@@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError>
         .build())
 }
 
-pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    // Check if the snapshot is a news snapshot and reformat it
+    // The format of the main crawl urls is different from the news crawl urls
+    // https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
+    // https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz
+    let snapshot_original_ref = options.snapshot.clone();
+    if news_re.is_match(&options.snapshot) {
+        let caps = news_re.captures(&options.snapshot).unwrap();
+        options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
+    }
     let paths = format!(
         "{}crawl-data/{}/{}.paths.gz",
         BASE_URL, options.snapshot, options.data_type
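For illustration, this is the rewrite the capture groups perform on a news reference before the path URL is assembled (a standalone sketch using the same regex as above, not part of the PR):

    use regex::Regex;

    fn main() {
        let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
        let caps = news_re.captures("CC-NEWS-2025-01").unwrap();
        // CC-NEWS-2025-01 becomes CC-NEWS/2025/01, which slots into
        // https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
        assert_eq!(
            format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]),
            "CC-NEWS/2025/01"
        );
    }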
@@ -89,6 +101,27 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
         .and_then(|segments| segments.last()) // Retrieves the last segment
         .unwrap_or("file.download"); // Fallback to generic filename
 
+    let resp = client.head(url.as_str()).send().await?;
+    match resp.status() {
+        status if status.is_success() => (),
+        status if status.as_u16() == 404 => {
+            return Err(format!(
+                "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError code: {} {}",
+                snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("")
+            )
+            .into());
+        }
+        status => {
+            return Err(format!(
+                "Couldn't download URL: {}. Error code: {} {}",
+                url,
+                status.as_str(),
+                status.canonical_reason().unwrap_or("")
+            )
+            .into());
+        }
+    }
+
     let request = client.get(url.as_str());
 
     let mut dst = options.dst.to_path_buf();
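The HEAD preflight turns a nonexistent crawl/subset combination into an immediate, readable error instead of a failed or empty download. For a reference that passes the format check but does not exist on the server, the message would look roughly like this (illustrative output, not captured from a real run):

    The reference combination you requested:
        CRAWL: CC-NEWS-2025-13
        SUBSET: warc
        URL: https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/13/warc.paths.gz

    Doesn't seem to exist or it is currently not accessible.
        Error code: 404 Not Found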
@@ -134,7 +167,7 @@ async fn download_task(
         } else {
             // We return an Error if something goes wrong here
             return Err(
-                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
+                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
             );
         }
     };
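With all three changes in place, a monthly CC-NEWS reference can be passed wherever a CC-MAIN one was accepted before. Assuming clap's default kebab-case subcommand naming and the warc subset, and with the remaining arguments elided, an invocation would look roughly like:

    cc-downloader download-paths CC-NEWS-2025-01 warc ...

A malformed reference such as CC-NEWS-2025-1 is rejected up front with the hint from the value parser: Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.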