# frozen_string_literal: true

require 'set'
require 'uri'
require 'fileutils'
require 'pathname'

module SubdomainProcessor
  def process_subdomains
    return unless @recursive_subdomains

    puts "Starting subdomain processing..."

    # extract the base domain from the URL for comparison
    base_domain = extract_base_domain(@base_url)
    @processed_domains = Set.new([base_domain])
    @subdomain_queue = Queue.new

    # scan downloaded files for subdomain links
    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."

    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)

    if subdomains_found.empty?
      puts "No subdomains found in downloaded content."
      return
    end

    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"

    # add found subdomains to the queue
    subdomains_found.each do |subdomain|
      full_domain = "#{subdomain}.#{base_domain}"
      @subdomain_queue << "https://#{full_domain}/"
    end

    # process the subdomain queue
    download_subdomains(base_domain)

    # after all downloads, rewrite URLs to point at the local copies
    rewrite_subdomain_links(base_domain) if @rewrite
  end
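
  # Based on the paths used in download_subdomain below, the on-disk layout
  # this produces is roughly:
  #
  #   <backup_path>/
  #     ...                      # the main site, downloaded beforehand
  #     subdomains/
  #       blog.example.com/...   # one directory per discovered subdomain
  #       shop.example.com/...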

  private

  def extract_base_domain(url)
    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
    return nil unless uri

    # URI.parse on a bare host leaves .host nil, so fall back to the path
    host = uri.host || uri.path.split('/').first
    host = host.downcase

    # extract the base domain (e.g., "example.com" from "sub.example.com")
    parts = host.split('.')
    return host if parts.size <= 2

    # for multi-part TLDs like co.uk, keep the last 3 parts
    if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
      parts.last(3).join('.')
    else
      parts.last(2).join('.')
    end
  end
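
  # Illustrative behaviour of the heuristic above (not part of the module):
  #   extract_base_domain("https://blog.example.com/post") #=> "example.com"
  #   extract_base_domain("https://shop.example.co.uk/")   #=> "example.co.uk"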

  def scan_files_for_subdomains(files, base_domain)
    return [] unless base_domain

    subdomains = Set.new

    files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)

        # extract URLs from HTML href/src attributes
        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from CSS
        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from JavaScript strings
        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end
      rescue => e
        puts "Error scanning file #{file_path}: #{e.message}"
      end
    end

    subdomains.to_a
  end
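
  # For example, <a href="https://blog.example.com/post"> matches the first
  # pattern and captures "blog"; url(https://cdn.example.com/bg.png) in CSS is
  # caught by the second; "https://api.example.com/v1" in JS by the third.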

  def download_subdomains(base_domain)
    puts "Starting subdomain downloads..."
    depth = 0
    max_depth = @subdomain_depth || 1

    while depth < max_depth && !@subdomain_queue.empty?
      current_batch = []

      # drain the queue to get all subdomains at the current depth
      while !@subdomain_queue.empty?
        current_batch << @subdomain_queue.pop
      end

      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."

      # download each subdomain
      current_batch.each do |subdomain_url|
        download_subdomain(subdomain_url, base_domain)
      end

      # if we need to go deeper, scan the newly downloaded files
      if depth + 1 < max_depth
        # get all files in the subdomains directory
        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
        new_subdomains = scan_files_for_subdomains(new_files, base_domain)

        # queue subdomains we haven't seen yet; download_subdomain is what
        # marks a domain as processed, so adding it here as well would cause
        # the next batch to be skipped
        new_subdomains.each do |subdomain|
          full_domain = "#{subdomain}.#{base_domain}"
          unless @processed_domains.include?(full_domain)
            @subdomain_queue << "https://#{full_domain}/"
          end
        end

        puts "Queued #{@subdomain_queue.size} new subdomains for depth #{depth + 2}" if !@subdomain_queue.empty?
      end

      depth += 1
    end
  end
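
  # With @subdomain_depth = 2, the loop above runs twice: the first batch is
  # the seeds found in the main site; after they download, their files are
  # scanned once more and any newly discovered hosts form the second batch.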

  def download_subdomain(subdomain_url, base_domain)
    begin
      uri = URI.parse(subdomain_url)
      subdomain_host = uri.host

      # skip if already processed
      if @processed_domains.include?(subdomain_host)
        puts "Skipping already processed subdomain: #{subdomain_host}"
        return
      end

      @processed_domains.add(subdomain_host)
      puts "Downloading subdomain: #{subdomain_url}"

      # create the directory for this subdomain
      subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
      FileUtils.mkdir_p(subdomain_dir)

      # create a subdomain downloader with appropriate options
      subdomain_options = {
        base_url: subdomain_url,
        directory: subdomain_dir,
        from_timestamp: @from_timestamp,
        to_timestamp: @to_timestamp,
        all: @all,
        threads_count: @threads_count,
        maximum_pages: [@maximum_pages / 2, 10].max,
        rewrite: @rewrite,
        # don't recursively process subdomains from here
        recursive_subdomains: false
      }

      # download the subdomain content
      subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
      subdomain_downloader.download_files

      puts "Completed download of subdomain: #{subdomain_host}"
    rescue => e
      puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
    end
  end

  def rewrite_subdomain_links(base_domain)
    puts "Rewriting all files to use local subdomain references..."

    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    subdomains = @processed_domains.reject { |domain| domain == base_domain }

    puts "Found #{all_files.size} files to check for rewriting"
    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"

    rewritten_count = 0

    all_files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)
        original_content = content.dup

        # compute the path from this file's directory back to backup_path so
        # rewritten links resolve no matter how deep the file sits (a fixed
        # "../" prefix would only be correct one directory below the root)
        local_root = Pathname.new(backup_path)
          .relative_path_from(Pathname.new(File.dirname(file_path))).to_s

        # replace subdomain URLs with local paths
        subdomains.each do |subdomain_host|
          # for HTML attributes (href, src, etc.)
          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            prefix, path, suffix = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{prefix}#{local_root}/subdomains/#{subdomain_host}#{path}#{suffix}"
          end

          # for CSS url()
          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
            path = $1
            path = "/index.html" if path.empty? || path == "/"
            "url(\"#{local_root}/subdomains/#{subdomain_host}#{path}\")"
          end

          # for JavaScript strings
          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            quote_start, path, quote_end = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{quote_start}#{local_root}/subdomains/#{subdomain_host}#{path}#{quote_end}"
          end
        end

        # save if modified
        if content != original_content
          File.write(file_path, content)
          rewritten_count += 1
        end
      rescue => e
        puts "Error rewriting file #{file_path}: #{e.message}"
      end
    end

    puts "Rewrote links in #{rewritten_count} files"
  end
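
  # For instance, with the relative root computed above, a root-level
  # index.html referencing https://blog.example.com/style.css is rewritten to
  # "./subdomains/blog.example.com/style.css", while a file one directory
  # deeper gets "../subdomains/blog.example.com/style.css".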
end
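
# A minimal sketch of how this mixin might be wired in (hypothetical; the
# actual WaybackMachineDownloader class lives elsewhere in this codebase and
# may integrate it differently):
#
#   class WaybackMachineDownloader
#     include SubdomainProcessor
#
#     def download_files
#       # ... fetch snapshots into backup_path ...
#       process_subdomains if @recursive_subdomains
#     end
#   end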