@@ -125,7 +125,7 @@ class WaybackMachineDownloader
125125
126126 attr_accessor :base_url , :exact_url , :directory , :all_timestamps ,
127127 :from_timestamp , :to_timestamp , :only_filter , :exclude_filter ,
128- :all , :maximum_pages , :threads_count , :logger , :reset , :keep
128+ :all , :maximum_pages , :threads_count , :logger , :reset , :keep , :rewrite
129129
130130 def initialize params
131131 validate_params ( params )
@@ -148,6 +148,7 @@ def initialize params
148148 @failed_downloads = Concurrent ::Array . new
149149 @connection_pool = ConnectionPool . new ( CONNECTION_POOL_SIZE )
150150 @db_mutex = Mutex . new
151+ @rewrite = params [ :rewrite ] || false
151152
152153 handle_reset
153154 end
@@ -533,6 +534,101 @@ def structure_dir_path dir_path
533534 end
534535 end
535536
# Building blocks for the patterns used by #rewrite_urls_to_relative.
# The optional two-letter modifier after the timestamp covers "id_" as well as
# the other Wayback Machine flags such as "im_", "js_", "cs_" and "if_".
WAYBACK_PREFIX_PATTERN = 'https?://web\.archive\.org/web/[0-9]+(?:[a-z]{2}_)?/'
REWRITABLE_ATTRIBUTES_PATTERN = '(?:href|src|action|data-src|data-url)'

# attr="<wayback prefix><url>" inside HTML tags.
ARCHIVED_ATTRIBUTE_URL =
  %r{(\s#{REWRITABLE_ATTRIBUTES_PATTERN}=["'])#{WAYBACK_PREFIX_PATTERN}([^"']+)(["'])}i
# url(<wayback prefix><url>) in CSS; group 1 is the (possibly empty) quote.
ARCHIVED_CSS_URL = %r{url\(\s*(["']?)#{WAYBACK_PREFIX_PATTERN}([^"')]+)\1\s*\)}i
# Any remaining quoted "<wayback prefix><url>" string, e.g. in JavaScript.
ARCHIVED_QUOTED_URL = %r{(["'])#{WAYBACK_PREFIX_PATTERN}([^"']+)(["'])}i
# attr="/path" (a single leading slash, not protocol-relative "//host").
ROOT_RELATIVE_ATTRIBUTE_URL =
  %r{(\s#{REWRITABLE_ATTRIBUTES_PATTERN}=["'])/([^"'/][^"']*)(["'])}i
# url(/path) in CSS; group 1 is the (possibly empty) quote.
ROOT_RELATIVE_CSS_URL = %r{url\(\s*(["']?)/([^"')/][^"')]*?)\1\s*\)}i
# <meta charset="..."> and <meta http-equiv=... content="...; charset=...">.
META_CHARSET = /<meta[^>]*?charset\s*=\s*["']?([^"'>\s;]+)/i

# Rewrites Wayback Machine URLs and root-relative URLs inside a downloaded
# HTML/CSS/JS file so they point at relative paths, then saves the file in
# place. The file is only written (and the notice printed) when something
# actually changed.
#
# This is a best-effort post-processing step: filesystem errors are logged
# through @logger instead of being raised.
#
# NOTE(review): root-relative URLs are rewritten to "./path", which a browser
# resolves against the directory of the referencing file, not the backup
# root — confirm this is the intended behavior for nested pages.
#
# @param file_path [String] path of the file to rewrite
# @return [nil]
def rewrite_urls_to_relative(file_path)
  return unless File.exist?(file_path)

  content = File.binread(file_path)
  apply_rewrite_encoding(content, File.extname(file_path).downcase)
  original = content.dup

  # URLs in HTML attributes
  content.gsub!(ARCHIVED_ATTRIBUTE_URL) do
    m = Regexp.last_match
    "#{m[1]}#{archived_url_to_relative(m[2])}#{m[3]}"
  end

  # URLs in CSS. The original quoting is kept: forcing double quotes would
  # terminate a surrounding style="..." attribute and break the markup.
  content.gsub!(ARCHIVED_CSS_URL) do
    m = Regexp.last_match
    "url(#{m[1]}#{archived_url_to_relative(m[2])}#{m[1]})"
  end

  # URLs in JavaScript (any remaining quoted archive URL)
  content.gsub!(ARCHIVED_QUOTED_URL) do
    m = Regexp.last_match
    "#{m[1]}#{archived_url_to_relative(m[2])}#{m[3]}"
  end

  # URLs in HTML attributes that start with a single slash
  content.gsub!(ROOT_RELATIVE_ATTRIBUTE_URL) do
    m = Regexp.last_match
    "#{m[1]}./#{m[2]}#{m[3]}"
  end

  # URLs in CSS that start with a single slash (quoting preserved, see above)
  content.gsub!(ROOT_RELATIVE_CSS_URL) do
    m = Regexp.last_match
    "url(#{m[1]}./#{m[2]}#{m[1]})"
  end

  return if content == original

  # save the modified content back to the file
  File.binwrite(file_path, content)
  puts "Rewrote URLs in #{file_path} to be relative."
rescue SystemCallError, IOError => e
  @logger.warn("Error rewriting URLs in #{file_path}: #{e.message}")
end

# Tags the raw bytes in +content+ with an encoding suitable for regex
# matching. HTML files use their declared <meta> charset (UTF-8 when absent
# or unknown); everything else is treated as UTF-8. When the bytes are not
# valid in that encoding, or the encoding is not ASCII-compatible (e.g.
# UTF-16), the content is matched byte-wise instead so that the substitutions
# cannot raise on it.
#
# @param content [String] file bytes; its encoding tag is changed in place
# @param file_ext [String] lower-cased extension including the dot
# @return [String] +content+
def apply_rewrite_encoding(content, file_ext)
  declared = 'UTF-8'
  declared = content[META_CHARSET, 1] || 'UTF-8' if %w[.html .htm].include?(file_ext)

  begin
    content.force_encoding(declared)
  rescue ArgumentError
    # unknown charset name in the document
    content.force_encoding(Encoding::UTF_8)
  end

  unless content.encoding.ascii_compatible? && content.valid_encoding?
    content.force_encoding(Encoding::BINARY)
  end
  content
end

# Converts the URL that followed a Wayback Machine prefix into a relative
# reference: absolute http(s) URLs are reduced to their path without the
# leading slash, root-relative URLs get a "./" prefix, and anything else
# (or anything URI cannot parse) is returned unchanged.
#
# @param url [String] the archived target URL
# @return [String] the relative replacement
def archived_url_to_relative(url)
  if url.start_with?('http')
    begin
      path = URI.parse(url).path
      # opaque URIs (e.g. "http:foo") have no path; leave them as they are
      path.nil? ? url : path.sub(%r{\A/}, '')
    rescue URI::Error
      url
    end
  elsif url.start_with?('/')
    "./#{url[1..-1]}"
  else
    url
  end
end
631+
536632 def download_file ( file_remote_info , http )
537633 current_encoding = "" . encoding
538634 file_url = file_remote_info [ :file_url ] . encode ( current_encoding )
@@ -564,6 +660,9 @@ def download_file (file_remote_info, http)
564660 begin
565661 structure_dir_path dir_path
566662 download_with_retry ( file_path , file_url , file_timestamp , http )
663+ if @rewrite && File . extname ( file_path ) =~ /\. (html?|css|js)$/i
664+ rewrite_urls_to_relative ( file_path )
665+ end
567666 "#{ file_url } -> #{ file_path } (#{ @processed_file_count + 1 } /#{ @total_to_download } )"
568667 rescue StandardError => e
569668 msg = "Failed: #{ file_url } # #{ e } (#{ @processed_file_count + 1 } /#{ @total_to_download } )"
0 commit comments