Skip to content

Commit 9b6166f

Browse files
committed
refactor: restructure the download code so that it passes all linter checks
This commit also prepares the code for use as a library and for generating bindings from it.
1 parent f2603fc commit 9b6166f

File tree

2 files changed

+107
-69
lines changed

2 files changed

+107
-69
lines changed

src/download.rs

Lines changed: 90 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,64 @@ use flate2::read::GzDecoder;
22
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
33
use reqwest::{header, Client, Url};
44
use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
5-
use reqwest_retry::policies::ExponentialBackoff;
6-
use reqwest_retry::Jitter;
7-
use reqwest_retry::RetryTransientMiddleware;
8-
use std::fs::File;
9-
use std::io::BufRead;
10-
use std::io::BufReader;
11-
use std::path::{Path, PathBuf};
12-
use std::process;
13-
use std::sync::Arc;
14-
use std::time::Duration;
15-
use tokio::io::AsyncWriteExt;
16-
use tokio::io::BufWriter;
17-
use tokio::sync::Semaphore;
18-
use tokio::task::JoinSet;
5+
use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
6+
use std::{
7+
fs::File,
8+
io::{BufRead, BufReader},
9+
path::{Path, PathBuf},
10+
process, str,
11+
sync::Arc,
12+
time::Duration,
13+
};
14+
use tokio::{
15+
io::{AsyncWriteExt, BufWriter},
16+
sync::Semaphore,
17+
task::JoinSet,
18+
};
1919

2020
use crate::errors::DownloadError;
2121

2222
const BASE_URL: &str = "https://data.commoncrawl.org/";
2323

2424
static APP_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),);
2525

26+
pub struct DownloadOptions<'a> {
27+
pub snapshot: String,
28+
pub data_type: &'a str,
29+
pub paths: &'a Path,
30+
pub dst: &'a Path,
31+
pub threads: usize,
32+
pub max_retries: usize,
33+
pub numbered: bool,
34+
pub files_only: bool,
35+
pub progress: bool,
36+
}
37+
38+
pub struct TaskOptions {
39+
pub number: usize,
40+
pub path: String,
41+
pub dst: PathBuf,
42+
pub numbered: bool,
43+
pub files_only: bool,
44+
pub progress: bool,
45+
}
46+
47+
impl Default for DownloadOptions<'_> {
48+
fn default() -> Self {
49+
DownloadOptions {
50+
snapshot: "".to_string(),
51+
data_type: "",
52+
paths: Path::new(""),
53+
dst: Path::new(""),
54+
threads: 1,
55+
max_retries: 1000,
56+
numbered: false,
57+
files_only: false,
58+
progress: false,
59+
}
60+
}
61+
}
62+
2663
fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError> {
2764
let retry_policy = ExponentialBackoff::builder()
2865
.retry_bounds(Duration::from_secs(1), Duration::from_secs(3600))
@@ -37,16 +74,15 @@ fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError>
3774
.build())
3875
}
3976

40-
pub async fn download_paths(
41-
snapshot: &String,
42-
data_type: &str,
43-
dst: &Path,
44-
) -> Result<(), DownloadError> {
45-
let paths = format!("{}crawl-data/{}/{}.paths.gz", BASE_URL, snapshot, data_type);
77+
pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
78+
let paths = format!(
79+
"{}crawl-data/{}/{}.paths.gz",
80+
BASE_URL, options.snapshot, options.data_type
81+
);
4682
println!("Downloading paths from: {}", paths);
4783
let url = Url::parse(&paths)?;
4884

49-
let client = new_client(1000)?;
85+
let client = new_client(options.max_retries)?;
5086

5187
let filename = url
5288
.path_segments() // Splits into segments of the URL
@@ -55,7 +91,7 @@ pub async fn download_paths(
5591

5692
let request = client.get(url.as_str());
5793

58-
let mut dst = dst.to_path_buf();
94+
let mut dst = options.dst.to_path_buf();
5995

6096
dst.push(filename);
6197

@@ -79,16 +115,11 @@ pub async fn download_paths(
79115

80116
async fn download_task(
81117
client: ClientWithMiddleware,
82-
download_url: String,
83-
number: usize,
84118
multibar: Arc<MultiProgress>,
85-
dst: PathBuf,
86-
numbered: bool,
87-
files_only: bool,
88-
progress: bool,
119+
task_options: TaskOptions,
89120
) -> Result<(), DownloadError> {
90121
// Parse URL into Url type
91-
let url = Url::parse(&download_url)?;
122+
let url = Url::parse(&task_options.path)?;
92123

93124
// We need to determine the file size before we download, so we can create a ProgressBar
94125
// A Header request for the CONTENT_LENGTH header gets us the file size
@@ -109,17 +140,17 @@ async fn download_task(
109140
};
110141

111142
// Parse the filename from the given URL
112-
let filename = if numbered {
113-
&format!("{}{}", number, ".txt.gz")
114-
} else if files_only {
143+
let filename = if task_options.numbered {
144+
&format!("{}{}", task_options.number, ".txt.gz")
145+
} else if task_options.files_only {
115146
url.path_segments()
116147
.and_then(|segments| segments.last())
117148
.unwrap_or("file.download")
118149
} else {
119150
url.path().strip_prefix("/").unwrap_or("file.download")
120151
};
121152

122-
let mut dst = dst.clone();
153+
let mut dst = task_options.dst.clone();
123154

124155
dst.push(filename);
125156

@@ -130,7 +161,7 @@ async fn download_task(
130161
// and add it to the multibar
131162
let progress_bar = multibar.add(ProgressBar::new(download_size));
132163

133-
if progress {
164+
if task_options.progress {
134165
// Set Style to the ProgressBar
135166
progress_bar.set_style(
136167
ProgressStyle::default_bar()
@@ -145,7 +176,7 @@ async fn download_task(
145176
}
146177

147178
// Create the directory if it doesn't exist
148-
if !numbered {
179+
if !task_options.numbered {
149180
if let Some(parent) = dst.parent() {
150181
tokio::fs::create_dir_all(parent).await?;
151182
}
@@ -163,13 +194,13 @@ async fn download_task(
163194
// We use the part from the reqwest-tokio example here on purpose
164195
// This way, we are able to increase the ProgressBar with every downloaded chunk
165196
while let Some(chunk) = download.chunk().await? {
166-
if progress {
197+
if task_options.progress {
167198
progress_bar.inc(chunk.len() as u64); // Increase ProgressBar by chunk size
168199
}
169200
outfile.write_all(&chunk).await?; // Write chunk to output file
170201
}
171202

172-
if progress {
203+
if task_options.progress {
173204
// Finish the progress bar to prevent glitches
174205
progress_bar.finish();
175206

@@ -187,22 +218,18 @@ async fn download_task(
187218
Ok(())
188219
}
189220

190-
pub async fn download(
191-
paths: &Path,
192-
dst: &Path,
193-
threads: usize,
194-
max_retries: usize,
195-
numbered: bool,
196-
files_only: bool,
197-
progress: bool,
198-
) -> Result<(), DownloadError> {
221+
pub async fn download(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
199222
// A vector containing all the URLs to download
200223

201224
let file = {
202-
let gzip_file = match File::open(paths) {
225+
let gzip_file = match File::open(options.paths) {
203226
Ok(file) => file,
204227
Err(e) => {
205-
eprintln!("Could not open file {}\nError: {}", paths.display(), e);
228+
eprintln!(
229+
"Could not open file {}\nError: {}",
230+
options.paths.display(),
231+
e
232+
);
206233
process::exit(1)
207234
}
208235
};
@@ -232,7 +259,7 @@ pub async fn download(
232259
);
233260

234261
// Only set the style if we are showing progress
235-
if progress {
262+
if options.progress {
236263
main_pb.set_style(
237264
indicatif::ProgressStyle::default_bar().template("{msg} {bar:10} {pos}/{len}")?,
238265
);
@@ -243,25 +270,30 @@ pub async fn download(
243270
main_pb.tick();
244271
}
245272

246-
let client = new_client(max_retries)?;
273+
let client = new_client(options.max_retries)?;
247274

248-
let semaphore = Arc::new(Semaphore::new(threads));
275+
let semaphore = Arc::new(Semaphore::new(options.threads));
249276
let mut set = JoinSet::new();
250277

251278
for (number, path) in paths {
252279
// Clone multibar and main_pb. We will move the clones into each task.
253280
let multibar = multibar.clone();
254281
let main_pb = main_pb.clone();
255282
let client = client.clone();
256-
let dst = dst.to_path_buf();
283+
let dst = options.dst.to_path_buf();
257284
let semaphore = semaphore.clone();
258285
set.spawn(async move {
259286
let _permit = semaphore.acquire().await;
260-
let res = download_task(
261-
client, path, number, multibar, dst, numbered, files_only, progress,
262-
)
263-
.await;
264-
if progress {
287+
let task_options = TaskOptions {
288+
path,
289+
number,
290+
dst,
291+
numbered: options.numbered,
292+
files_only: options.files_only,
293+
progress: options.progress,
294+
};
295+
let res = download_task(client, multibar, task_options).await;
296+
if options.progress {
265297
// Increment the main progress bar.
266298
main_pb.inc(1);
267299
}
@@ -289,7 +321,7 @@ pub async fn download(
289321
}
290322
}
291323

292-
if progress {
324+
if options.progress {
293325
// Change the message on the overall progress indicator.
294326
main_pb.finish_with_message("done");
295327

src/main.rs

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ async fn main() {
1616
data_type,
1717
dst,
1818
}) => {
19-
match download::download_paths(snapshot, data_type.as_str(), dst).await {
19+
let options = download::DownloadOptions {
20+
snapshot: snapshot.to_string(),
21+
data_type: data_type.as_str(),
22+
dst,
23+
..Default::default()
24+
};
25+
match download::download_paths(options).await {
2026
Ok(_) => (),
2127
Err(e) => {
2228
eprintln!("Error downloading paths: {}", e);
@@ -35,17 +41,17 @@ async fn main() {
3541
if *numbered && *files_only {
3642
eprintln!("Numbered and Files Only flags are incompatible");
3743
} else {
38-
match download::download(
39-
path_file,
44+
let options = download::DownloadOptions {
45+
paths: path_file,
4046
dst,
41-
*threads,
42-
*retries,
43-
*numbered,
44-
*files_only,
45-
*progress,
46-
)
47-
.await
48-
{
47+
progress: *progress,
48+
threads: *threads,
49+
max_retries: *retries,
50+
numbered: *numbered,
51+
files_only: *files_only,
52+
..Default::default()
53+
};
54+
match download::download(options).await {
4955
Ok(_) => (),
5056
Err(e) => {
5157
eprintln!("Error downloading paths: {}", e);

0 commit comments

Comments
 (0)