Skip to content

Commit 0e38844

Browse files
committed
fix: This commit fixes the error where a user requested a crawl that doesn't exist and cc-downloader downloaded the body of the error response instead of failing.
Now this action produces an error.
1 parent 35db859 commit 0e38844

File tree

2 files changed

+24
-7
lines changed

2 files changed

+24
-7
lines changed

Cargo.toml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,33 @@
11
[package]
22
name = "cc-downloader"
33
version = "0.5.2"
4-
edition = "2021"
4+
edition = "2024"
55
authors = ["Pedro Ortiz Suarez <pedro@commoncrawl.org>"]
66
description = "A polite and user-friendly downloader for Common Crawl data."
77
license = "MIT OR Apache-2.0"
8-
rust-version = "1.83"
8+
rust-version = "1.85"
99
readme = "README.md"
1010
homepage = "https://commoncrawl.org"
1111
repository = "https://github.com/commoncrawl/cc-downloader"
1212
documentation = "https://docs.rs/cc-downloader"
1313

1414
[dependencies]
15-
clap = { version = "4.5.29", features = ["derive"] }
16-
flate2 = "1.0.35"
15+
clap = { version = "4.5.31", features = ["derive"] }
16+
flate2 = "1.1.0"
1717
futures = "0.3.31"
1818
indicatif = "0.17.11"
1919
reqwest = { version = "0.12.12", default-features = false, features = [
2020
"stream",
2121
"rustls-tls",
2222
] }
23-
reqwest-middleware = "0.4.0"
23+
reqwest-middleware = "0.4.1"
2424
reqwest-retry = "0.7.0"
2525
tokio = { version = "1.43.0", features = ["full"] }
2626
tokio-util = { version = "0.7.13", features = ["compat"] }
2727
url = "2.5.4"
2828

2929
[dev-dependencies]
30-
serde = { version = "1.0.217", features = ["derive"] }
30+
serde = { version = "1.0.218", features = ["derive"] }
3131
reqwest = { version = "0.12.12", default-features = false, features = [
3232
"stream",
3333
"rustls-tls",

src/download.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,23 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download
8989
.and_then(|segments| segments.last()) // Retrieves the last segment
9090
.unwrap_or("file.download"); // Fallback to generic filename
9191

92+
let resp = client.head(url.as_str()).send().await?;
93+
match resp.status() {
94+
status if status.is_success() => (),
95+
status if status.is_client_error() => {
96+
return Err(format!(
97+
"\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET:{}\n\tULR: {}\n\nDoesn't seem to exist or it is not accessible.\n\tError Code: {:?}",
98+
options.snapshot, options.data_type, url, status
99+
)
100+
.into());
101+
}
102+
_ => {
103+
return Err(
104+
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
105+
)
106+
}
107+
}
108+
92109
let request = client.get(url.as_str());
93110

94111
let mut dst = options.dst.to_path_buf();
@@ -134,7 +151,7 @@ async fn download_task(
134151
} else {
135152
// We return an Error if something goes wrong here
136153
return Err(
137-
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
154+
format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
138155
);
139156
}
140157
};

0 commit comments

Comments
 (0)