
Commit 0387855

Ability to recursively download across subdomains
This is quite experimental. Fixes #15, but it still needs more testing.
1 parent 2eead8c commit 0387855

3 files changed, +260 -0 lines

bin/wayback_machine_downloader

Lines changed: 8 additions & 0 deletions
@@ -74,6 +74,14 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
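
For reference, a minimal invocation exercising the new flags might look like the following (illustrative only; example.com is a placeholder and the existing positional URL argument is assumed to work as before):

    wayback_machine_downloader https://example.com --recursive-subdomains --subdomain-depth 2

--subdomain-depth is optional and defaults to 1, i.e. only subdomains referenced directly by the primary site's files are fetched.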

lib/wayback_machine_downloader.rb

Lines changed: 14 additions & 0 deletions
@@ -14,6 +14,7 @@
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/subdom_processor'
 
 class ConnectionPool
   MAX_AGE = 300
@@ -112,6 +113,7 @@ def cleanup_old_connections
 class WaybackMachineDownloader
 
   include ArchiveAPI
+  include SubdomainProcessor
 
   VERSION = "2.3.10"
   DEFAULT_TIMEOUT = 30
@@ -153,6 +155,8 @@ def initialize params
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
     @db_mutex = Mutex.new
     @rewrite = params[:rewrite] || false
+    @recursive_subdomains = params[:recursive_subdomains] || false
+    @subdomain_depth = params[:subdomain_depth] || 1
 
     handle_reset
   end
@@ -513,6 +517,16 @@ def download_files
 
     end_time = Time.now
     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+
+    # process subdomains if enabled
+    if @recursive_subdomains
+      subdomain_start_time = Time.now
+      process_subdomains
+      subdomain_end_time = Time.now
+      subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
+      puts "Subdomain processing finished in #{subdomain_time}s."
+    end
+
     puts "Results saved in #{backup_path}"
     cleanup
   end
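
The new options travel through the params hash, so the feature can also be switched on when the gem is used as a library. A minimal sketch, assuming example.com as a placeholder target and leaving the other initializer options at their defaults:

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url: 'https://example.com',  # placeholder target
      recursive_subdomains: true,       # wired to @recursive_subdomains above
      subdomain_depth: 2                # wired to @subdomain_depth (defaults to 1)
    )
    downloader.download_files

This mirrors what the new SubdomainProcessor module itself does for each discovered subdomain, except that it passes recursive_subdomains: false to keep the recursion bounded.
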
lib/wayback_machine_downloader/subdom_processor.rb (new file)

Lines changed: 238 additions & 0 deletions
@@ -0,0 +1,238 @@
# frozen_string_literal: true

module SubdomainProcessor
  def process_subdomains
    return unless @recursive_subdomains

    puts "Starting subdomain processing..."

    # extract base domain from the URL for comparison
    base_domain = extract_base_domain(@base_url)
    @processed_domains = Set.new([base_domain])
    @subdomain_queue = Queue.new

    # scan downloaded files for subdomain links
    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."

    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)

    if subdomains_found.empty?
      puts "No subdomains found in downloaded content."
      return
    end

    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"

    # add found subdomains to the queue
    subdomains_found.each do |subdomain|
      full_domain = "#{subdomain}.#{base_domain}"
      @subdomain_queue << "https://#{full_domain}/"
    end

    # process the subdomain queue
    download_subdomains(base_domain)

    # after all downloads, rewrite all URLs to make local references
    rewrite_subdomain_links(base_domain) if @rewrite
  end

  private

  def extract_base_domain(url)
    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
    return nil unless uri

    host = uri.host || uri.path.split('/').first
    host = host.downcase

    # extract the base domain (e.g., "example.com" from "sub.example.com")
    parts = host.split('.')
    return host if parts.size <= 2

    # for domains like co.uk, we want to keep the last 3 parts
    if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
      parts.last(3).join('.')
    else
      parts.last(2).join('.')
    end
  end

  def scan_files_for_subdomains(files, base_domain)
    return [] unless base_domain

    subdomains = Set.new

    files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)

        # extract URLs from HTML href/src attributes
        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from CSS
        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from JavaScript strings
        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end
      rescue => e
        puts "Error scanning file #{file_path}: #{e.message}"
      end
    end

    subdomains.to_a
  end

  def download_subdomains(base_domain)
    puts "Starting subdomain downloads..."
    depth = 0
    max_depth = @subdomain_depth || 1

    while depth < max_depth && !@subdomain_queue.empty?
      current_batch = []

      # get all subdomains at current depth
      while !@subdomain_queue.empty?
        current_batch << @subdomain_queue.pop
      end

      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."

      # download each subdomain
      current_batch.each do |subdomain_url|
        download_subdomain(subdomain_url, base_domain)
      end

      # if we need to go deeper, scan the newly downloaded files
      if depth + 1 < max_depth
        # get all files in the subdomains directory
        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
        new_subdomains = scan_files_for_subdomains(new_files, base_domain)

        # filter out already processed subdomains
        new_subdomains.each do |subdomain|
          full_domain = "#{subdomain}.#{base_domain}"
          unless @processed_domains.include?(full_domain)
            @processed_domains.add(full_domain)
            @subdomain_queue << "https://#{full_domain}/"
          end
        end

        puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
      end

      depth += 1
    end
  end

  def download_subdomain(subdomain_url, base_domain)
    begin
      uri = URI.parse(subdomain_url)
      subdomain_host = uri.host

      # skip if already processed
      if @processed_domains.include?(subdomain_host)
        puts "Skipping already processed subdomain: #{subdomain_host}"
        return
      end

      @processed_domains.add(subdomain_host)
      puts "Downloading subdomain: #{subdomain_url}"

      # create the directory for this subdomain
      subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
      FileUtils.mkdir_p(subdomain_dir)

      # create subdomain downloader with appropriate options
      subdomain_options = {
        base_url: subdomain_url,
        directory: subdomain_dir,
        from_timestamp: @from_timestamp,
        to_timestamp: @to_timestamp,
        all: @all,
        threads_count: @threads_count,
        maximum_pages: [@maximum_pages / 2, 10].max,
        rewrite: @rewrite,
        # don't recursively process subdomains from here
        recursive_subdomains: false
      }

      # download the subdomain content
      subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
      subdomain_downloader.download_files

      puts "Completed download of subdomain: #{subdomain_host}"
    rescue => e
      puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
    end
  end

  def rewrite_subdomain_links(base_domain)
    puts "Rewriting all files to use local subdomain references..."

    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    subdomains = @processed_domains.reject { |domain| domain == base_domain }

    puts "Found #{all_files.size} files to check for rewriting"
    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"

    rewritten_count = 0

    all_files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)
        original_content = content.dup

        # replace subdomain URLs with local paths
        subdomains.each do |subdomain_host|
          # for HTML attributes (href, src, etc.)
          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            prefix, path, suffix = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
          end

          # for CSS url()
          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
            path = $1
            path = "/index.html" if path.empty? || path == "/"
            "url(\"../subdomains/#{subdomain_host}#{path}\")"
          end

          # for JavaScript strings
          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            quote_start, path, quote_end = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
          end
        end

        # save if modified
        if content != original_content
          File.write(file_path, content)
          rewritten_count += 1
        end
      rescue => e
        puts "Error rewriting file #{file_path}: #{e.message}"
      end
    end

    puts "Rewrote links in #{rewritten_count} files"
  end
end
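
Illustrative only: when rewriting is enabled (@rewrite), rewrite_subdomain_links points absolute subdomain URLs at the local subdomains/ directory created by download_subdomain; blog.example.com is a hypothetical host:

    before: <a href="https://blog.example.com/post/1">
    after:  <a href="../subdomains/blog.example.com/post/1">

    a bare "https://blog.example.com/" becomes
    "../subdomains/blog.example.com/index.html", because empty or "/" paths
    are mapped to /index.html

Each subdomain's files are saved under backup_path/subdomains/<host>/, which is why the rewritten references use relative ../subdomains/ paths.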
