Skip to content

Commit 801fb77

Browse files
committed
Perf: Refactored a huge function into smaller methods
1 parent e9849e6 commit 801fb77

File tree

2 files changed

+112
-84
lines changed

2 files changed

+112
-84
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 38 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
require_relative 'wayback_machine_downloader/to_regex'
1616
require_relative 'wayback_machine_downloader/archive_api'
1717
require_relative 'wayback_machine_downloader/subdom_processor'
18+
require_relative 'wayback_machine_downloader/url_rewrite'
1819

1920
class ConnectionPool
2021
MAX_AGE = 300
@@ -474,6 +475,39 @@ def append_to_db(file_id)
474475
end
475476
end
476477

478+
# Posts one download job per file onto the worker thread pool.
#
# Each job checks out a pooled HTTP connection, downloads the file, and
# updates the shared progress counter under @download_mutex so the
# "(processed/total)" message stays accurate across threads. The file id is
# recorded in the download DB only after a confirmed success, outside the
# connection block so the connection is returned to the pool first.
# Errors are logged and still counted so progress never stalls.
#
# @param pool [Concurrent::FixedThreadPool] worker pool to post jobs to
# @param files_to_process [Array<Hash>] file records (reads :file_id, :file_url)
# @return [Array<Hash>] files_to_process (result of #each); callers ignore it
def processing_files(pool, files_to_process)
  files_to_process.each do |file_remote_info|
    pool.post do
      download_success = false
      begin
        @connection_pool.with_connection do |connection|
          result_message = download_file(file_remote_info, connection)
          # assume download success if the result message contains ' -> '
          if result_message && result_message.include?(' -> ')
            download_success = true
          end
          @download_mutex.synchronize do
            @processed_file_count += 1
            # adjust progress message to reflect remaining files
            progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
            puts progress_message if progress_message
          end
        end
        # append to DB only after successful download, outside the connection block
        if download_success
          append_to_db(file_remote_info[:file_id])
        end
      rescue => e
        @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
        @download_mutex.synchronize do
          @processed_file_count += 1
        end
      end
      # throttle requests to the Wayback Machine between jobs
      sleep(RATE_LIMIT)
    end
  end
end
510+
477511
def download_files
478512
start_time = Time.now
479513
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -520,36 +554,7 @@ def download_files
520554
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
521555
pool = Concurrent::FixedThreadPool.new(thread_count)
522556

523-
files_to_process.each do |file_remote_info|
524-
pool.post do
525-
download_success = false
526-
begin
527-
@connection_pool.with_connection do |connection|
528-
result_message = download_file(file_remote_info, connection)
529-
# assume download success if the result message contains ' -> '
530-
if result_message && result_message.include?(' -> ')
531-
download_success = true
532-
end
533-
@download_mutex.synchronize do
534-
@processed_file_count += 1
535-
# adjust progress message to reflect remaining files
536-
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
537-
puts progress_message if progress_message
538-
end
539-
end
540-
# sppend to DB only after successful download outside the connection block
541-
if download_success
542-
append_to_db(file_remote_info[:file_id])
543-
end
544-
rescue => e
545-
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
546-
@download_mutex.synchronize do
547-
@processed_file_count += 1
548-
end
549-
end
550-
sleep(RATE_LIMIT)
551-
end
552-
end
557+
processing_files(pool, files_to_process)
553558

554559
pool.shutdown
555560
pool.wait_for_termination
@@ -609,64 +614,13 @@ def rewrite_urls_to_relative(file_path)
609614
end
610615

611616
# URLs in HTML attributes
612-
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
613-
prefix, url, suffix = $1, $2, $3
614-
615-
if url.start_with?('http')
616-
begin
617-
uri = URI.parse(url)
618-
path = uri.path
619-
path = path[1..-1] if path.start_with?('/')
620-
"#{prefix}#{path}#{suffix}"
621-
rescue
622-
"#{prefix}#{url}#{suffix}"
623-
end
624-
elsif url.start_with?('/')
625-
"#{prefix}./#{url[1..-1]}#{suffix}"
626-
else
627-
"#{prefix}#{url}#{suffix}"
628-
end
629-
end
617+
rewrite_html_attr_urls(content)
630618

631619
# URLs in CSS
632-
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
633-
url = $1
634-
635-
if url.start_with?('http')
636-
begin
637-
uri = URI.parse(url)
638-
path = uri.path
639-
path = path[1..-1] if path.start_with?('/')
640-
"url(\"#{path}\")"
641-
rescue
642-
"url(\"#{url}\")"
643-
end
644-
elsif url.start_with?('/')
645-
"url(\"./#{url[1..-1]}\")"
646-
else
647-
"url(\"#{url}\")"
648-
end
649-
end
620+
rewrite_css_urls(content)
650621

651622
# URLs in JavaScript
652-
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
653-
quote_start, url, quote_end = $1, $2, $3
654-
655-
if url.start_with?('http')
656-
begin
657-
uri = URI.parse(url)
658-
path = uri.path
659-
path = path[1..-1] if path.start_with?('/')
660-
"#{quote_start}#{path}#{quote_end}"
661-
rescue
662-
"#{quote_start}#{url}#{quote_end}"
663-
end
664-
elsif url.start_with?('/')
665-
"#{quote_start}./#{url[1..-1]}#{quote_end}"
666-
else
667-
"#{quote_start}#{url}#{quote_end}"
668-
end
669-
end
623+
rewrite_js_urls(content)
670624

671625
# for URLs in HTML attributes that start with a single slash
672626
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# frozen_string_literal: true
2+
3+
# URLs in HTML attributes
4+
# Rewrites absolute Wayback Machine URLs inside HTML attributes
# (href/src/action/data-src/data-url) into local relative paths.
#
# Mutates +content+ in place via gsub! and returns it. Archived absolute
# URLs are reduced to their path (leading slash stripped); root-relative
# URLs become "./..."; anything unparseable is left untouched.
#
# @param content [String] HTML text to rewrite (must be mutable)
# @return [String] the same, rewritten, string
def rewrite_html_attr_urls(content)
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    attr_prefix = Regexp.last_match(1)
    target = Regexp.last_match(2)
    attr_suffix = Regexp.last_match(3)

    rewritten =
      if target.start_with?('http')
        begin
          path = URI.parse(target).path
          path.start_with?('/') ? path[1..-1] : path
        rescue
          # unparseable URL: keep it verbatim
          target
        end
      elsif target.start_with?('/')
        "./#{target[1..-1]}"
      else
        target
      end

    "#{attr_prefix}#{rewritten}#{attr_suffix}"
  end
  content
end
26+
27+
# URLs in CSS
28+
# Rewrites absolute Wayback Machine URLs inside CSS url(...) values into
# local relative paths, always emitting the double-quoted form url("...").
#
# Mutates +content+ in place via gsub! and returns it. Archived absolute
# URLs are reduced to their path (leading slash stripped); root-relative
# URLs become "./..."; anything unparseable is left untouched.
#
# @param content [String] CSS text to rewrite (must be mutable)
# @return [String] the same, rewritten, string
def rewrite_css_urls(content)
  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
    target = Regexp.last_match(1)

    local =
      if target.start_with?('http')
        begin
          path = URI.parse(target).path
          path.start_with?('/') ? path[1..-1] : path
        rescue
          # unparseable URL: keep it verbatim
          target
        end
      elsif target.start_with?('/')
        "./#{target[1..-1]}"
      else
        target
      end

    "url(\"#{local}\")"
  end
  content
end
50+
51+
# URLs in JavaScript
52+
# Rewrites absolute Wayback Machine URLs appearing inside JavaScript string
# literals (single- or double-quoted) into local relative paths, preserving
# the original quote characters.
#
# Mutates +content+ in place via gsub! and returns it. Archived absolute
# URLs are reduced to their path (leading slash stripped); root-relative
# URLs become "./..."; anything unparseable is left untouched.
#
# @param content [String] JavaScript text to rewrite (must be mutable)
# @return [String] the same, rewritten, string
def rewrite_js_urls(content)
  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    open_quote = Regexp.last_match(1)
    target = Regexp.last_match(2)
    close_quote = Regexp.last_match(3)

    replacement =
      if target.start_with?('http')
        begin
          path = URI.parse(target).path
          path.start_with?('/') ? path[1..-1] : path
        rescue
          # unparseable URL: keep it verbatim
          target
        end
      elsif target.start_with?('/')
        "./#{target[1..-1]}"
      else
        target
      end

    "#{open_quote}#{replacement}#{close_quote}"
  end

  content
end

0 commit comments

Comments
 (0)