@@ -125,6 +125,7 @@ class WaybackMachineDownloader
125
125
STATE_CDX_FILENAME = ".cdx.json"
126
126
STATE_DB_FILENAME = ".downloaded.txt"
127
127
128
+
128
129
attr_accessor :base_url , :exact_url , :directory , :all_timestamps ,
129
130
:from_timestamp , :to_timestamp , :only_filter , :exclude_filter ,
130
131
:all , :maximum_pages , :threads_count , :logger , :reset , :keep , :rewrite
@@ -158,6 +159,9 @@ def initialize params
158
159
@recursive_subdomains = params [ :recursive_subdomains ] || false
159
160
@subdomain_depth = params [ :subdomain_depth ] || 1
160
161
162
+ # Regexp used to reject invalid/unencoded Wayback URLs
163
+ @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/ \/ (((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/ ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/ (((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/ ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/ ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\? ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/ |\? )*)?(\# ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/ |\? )*)?)$/
164
+
161
165
handle_reset
162
166
end
163
167
@@ -754,6 +758,12 @@ def download_with_retry(file_path, file_url, file_timestamp, connection, redirec
754
758
# Escape square brackets because they are not valid in URI()
755
759
wayback_url = wayback_url . gsub ( '[' , '%5B' ) . gsub ( ']' , '%5D' )
756
760
761
+ # Reject an invalid/unencoded wayback_url, treating it as if the resource were not found
762
+ if not @url_regexp . match? ( wayback_url )
763
+ @logger . warn ( "Skipped #{ file_url } : invalid URL" )
764
+ return :skipped_not_found
765
+ end
766
+
757
767
request = Net ::HTTP ::Get . new ( URI ( wayback_url ) )
758
768
request [ "Connection" ] = "keep-alive"
759
769
request [ "User-Agent" ] = "WaybackMachineDownloader/#{ VERSION } "
0 commit comments