Skip to content

Commit e28d7d5

Browse files
Experimental ability to rewrite URLs for local browsing
1 parent a7a2557 commit e28d7d5

File tree

2 files changed

+104
-1
lines changed

2 files changed

+104
-1
lines changed

bin/wayback_machine_downloader

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
6262
options[:rewritten] = true
6363
end
6464

65+
opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
66+
options[:rewrite] = true
67+
end
68+
6569
opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
6670
options[:reset] = true
6771
end

lib/wayback_machine_downloader.rb

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ class WaybackMachineDownloader
125125

126126
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
127127
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
128-
:all, :maximum_pages, :threads_count, :logger, :reset, :keep
128+
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
129129

130130
def initialize params
131131
validate_params(params)
@@ -148,6 +148,7 @@ def initialize params
148148
@failed_downloads = Concurrent::Array.new
149149
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
150150
@db_mutex = Mutex.new
151+
@rewrite = params[:rewrite] || false
151152

152153
handle_reset
153154
end
@@ -533,6 +534,101 @@ def structure_dir_path dir_path
533534
end
534535
end
535536

537+
# Rewrites URLs inside a downloaded HTML/CSS/JS file, in place, so the saved
# copy can be browsed from local disk.
#
# Two kinds of URL are rewritten:
# * Wayback Machine URLs (https://web.archive.org/web/<timestamp>[<xx>_]/<original>)
#   found in HTML attributes, CSS url(...) and quoted strings are replaced by
#   the path of the original URL, made relative.
# * Root-relative URLs ("/foo") in HTML attributes and CSS url(...) are
#   prefixed with "./".
#
# The rewrite is best-effort: any failure is logged through @logger and the
# file is left as it was, so a file that downloaded fine is not reported as
# a failed download just because it could not be rewritten.
#
# @param file_path [String] path of the file to rewrite
# @return [void]
def rewrite_urls_to_relative(file_path)
  return unless File.exist?(file_path)

  # Maps the original URL captured after the Wayback prefix to a local path.
  localize = lambda do |url|
    if url.start_with?('http')
      begin
        path = URI.parse(url).path
        path.start_with?('/') ? path[1..-1] : path
      rescue StandardError
        # Unparseable original URL: keep the captured text unchanged.
        url
      end
    elsif url.start_with?('/')
      "./#{url[1..-1]}"
    else
      url
    end
  end

  content = File.binread(file_path)

  if %w[.html .htm].include?(File.extname(file_path).downcase)
    declared = content[/<meta\s+charset=["']?([^"'>]+)/i, 1] || 'UTF-8'
    begin
      content.force_encoding(declared)
    rescue ArgumentError
      # Unknown or malformed charset name in the page.
      content.force_encoding('UTF-8')
    end
  else
    content.force_encoding('UTF-8')
  end

  # The patterns below are ASCII-only, so they work on raw bytes. Fall back
  # to binary when the bytes are not valid in the assumed encoding (e.g. a
  # Latin-1 file assumed to be UTF-8) or the encoding is not ASCII-compatible;
  # otherwise gsub! would raise instead of rewriting.
  unless content.encoding.ascii_compatible? && content.valid_encoding?
    content.force_encoding(Encoding::BINARY)
  end

  # URLs in HTML attributes. The optional two-letter modifier covers id_,
  # im_, cs_, js_, if_ and the other Wayback capture flags.
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:[a-z]{2}_)?\/([^"']+)(["'])/i) do
    m = Regexp.last_match
    "#{m[1]}#{localize.call(m[2])}#{m[3]}"
  end

  # URLs in CSS
  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:[a-z]{2}_)?\/([^"'\)]+)["']?\s*\)/i) do
    "url(\"#{localize.call(Regexp.last_match(1))}\")"
  end

  # URLs in JavaScript (any remaining quoted Wayback URL)
  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:[a-z]{2}_)?\/([^"']+)(["'])/i) do
    m = Regexp.last_match
    "#{m[1]}#{localize.call(m[2])}#{m[3]}"
  end

  # URLs in HTML attributes that start with a single slash
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
    m = Regexp.last_match
    "#{m[1]}./#{m[2]}#{m[3]}"
  end

  # URLs in CSS that start with a single slash
  content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
    "url(\"./#{Regexp.last_match(1)}\")"
  end

  # Save the modified content back to the file, byte-for-byte.
  File.binwrite(file_path, content)
  puts "Rewrote URLs in #{file_path} to be relative."
rescue Errno::ENOENT => e
  @logger.warn("Error reading file #{file_path}: #{e.message}")
rescue StandardError => e
  @logger.warn("Error rewriting URLs in #{file_path}: #{e.message}")
end
631+
536632
def download_file (file_remote_info, http)
537633
current_encoding = "".encoding
538634
file_url = file_remote_info[:file_url].encode(current_encoding)
@@ -564,6 +660,9 @@ def download_file (file_remote_info, http)
564660
begin
565661
structure_dir_path dir_path
566662
download_with_retry(file_path, file_url, file_timestamp, http)
663+
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
664+
rewrite_urls_to_relative(file_path)
665+
end
567666
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
568667
rescue StandardError => e
569668
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"

0 commit comments

Comments
 (0)