Skip to content

Commit 6ad312f

Browse files
Sanitizing HTML tags
Some sites contain HTML tags *in* their URLs, which then fail to save on some systems, such as Windows.
1 parent 62ea35d commit 6ad312f

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ def get_composite_snapshot_file_list(target_timestamp)
342342
next if file_timestamp.to_i > target_timestamp
343343
file_id = file_url.split('/')[3..-1].join('/')
344344
file_id = CGI::unescape file_id
345+
file_id.gsub!(/<[^>]*>/, '') # sanitize HTML tags
345346
file_id = file_id.tidy_bytes unless file_id == ""
346347
next if file_id.nil?
347348
next if match_exclude_filter(file_url)
@@ -370,9 +371,12 @@ def get_file_list_curated
370371
next unless file_url.include?('/')
371372
file_id = file_url.split('/')[3..-1].join('/')
372373
file_id = CGI::unescape file_id
374+
file_id.gsub!(/<[^>]*>/, '') # sanitize HTML tags
373375
file_id = file_id.tidy_bytes unless file_id == ""
374376
if file_id.nil?
375377
puts "Malformed file url, ignoring: #{file_url}"
378+
elsif file_id.include?('<') || file_id.include?('>')
379+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
376380
else
377381
if match_exclude_filter(file_url)
378382
puts "File url matches exclude filter, ignoring: #{file_url}"
@@ -397,9 +401,12 @@ def get_file_list_all_timestamps
397401
file_id = file_url.split('/')[3..-1].join('/')
398402
file_id_and_timestamp = [file_timestamp, file_id].join('/')
399403
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
404+
file_id_and_timestamp.gsub!(/<[^>]*>/, '') # sanitize HTML tags
400405
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
401406
if file_id.nil?
402407
puts "Malformed file url, ignoring: #{file_url}"
408+
elsif file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
409+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
403410
else
404411
if match_exclude_filter(file_url)
405412
puts "File url matches exclude filter, ignoring: #{file_url}"

0 commit comments

Comments
 (0)