Skip to content

Commit fd329af

Browse files
Merge pull request #20 from underarchiver/rfc3968-url-validity-check
Prevent fetching of non-RFC 3986-compliant URLs
2 parents 0387855 + f03d92a commit fd329af

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ class WaybackMachineDownloader
125125
STATE_CDX_FILENAME = ".cdx.json"
126126
STATE_DB_FILENAME = ".downloaded.txt"
127127

128+
128129
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
129130
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
130131
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
@@ -158,6 +159,9 @@ def initialize params
158159
@recursive_subdomains = params[:recursive_subdomains] || false
159160
@subdomain_depth = params[:subdomain_depth] || 1
160161

162+
# Regexp used to reject invalid/unencoded wayback URLs before they are fetched
163+
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
164+
161165
handle_reset
162166
end
163167

@@ -754,6 +758,12 @@ def download_with_retry(file_path, file_url, file_timestamp, connection, redirec
754758
# Escape square brackets because they are not valid in URI()
755759
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
756760

761+
# reject invalid/unencoded wayback_url, behaving as if the resource weren't found
762+
if not @url_regexp.match?(wayback_url)
763+
@logger.warn("Skipped #{file_url}: invalid URL")
764+
return :skipped_not_found
765+
end
766+
757767
request = Net::HTTP::Get.new(URI(wayback_url))
758768
request["Connection"] = "keep-alive"
759769
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"

0 commit comments

Comments
 (0)