Skip to content

Commit 6bc0894

Browse files
More aggressive sanitization
This should hopefully deal with some of the issues we've seen. What a ride!
1 parent c731e0c commit 6bc0894

File tree

1 file changed

+80
-7
lines changed

1 file changed

+80
-7
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
require 'logger'
1212
require 'zlib'
1313
require 'stringio'
14+
require 'digest'
1415
require_relative 'wayback_machine_downloader/tidy_bytes'
1516
require_relative 'wayback_machine_downloader/to_regex'
1617
require_relative 'wayback_machine_downloader/archive_api'
@@ -171,12 +172,19 @@ def initialize params
171172

172173
# Derives the backup directory name from @base_url.
#
# A trailing '/*' wildcard is dropped first. When the URL contains '//',
# the third '/'-separated component (the host, including any ':port') is
# used; otherwise the whole string is used. On Windows, characters that are
# not allowed in file names are replaced with '_' and trailing spaces/dots
# are removed, which avoids ENOTDIR on mkdir (colon in host:port).
#
# Returns a non-empty String; 'site' when nothing usable remains.
def backup_name
  url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url

  # split('/')[2] is nil for inputs such as 'http://'; coerce to a String so
  # the Windows sanitization below never runs on nil.
  raw = (url_to_process.include?('//') ? url_to_process.split('/')[2] : url_to_process).to_s

  if Gem.win_platform?
    raw = raw.gsub(/[:*?"<>|]/, '_').gsub(/[ .]+\z/, '')
  end

  raw.empty? ? 'site' : raw
end
181189

182190
def backup_path
@@ -768,18 +776,83 @@ def setup_logger
768776

769777
# Safely sanitizes a file id (or id+timestamp) into a relative path that can
# be created on disk.
#
# Steps: percent-decode (repeated until stable, to unwrap double-encoding),
# coerce to valid UTF-8, strip HTML-comment openers and control characters,
# replace a query string with a short SHA-256 digest suffix, collapse
# duplicate slashes, then sanitize every path segment.
#
# raw      - String id to sanitize.
# file_url - source URL; used only in the warning logged on failure.
#
# Returns nil when raw is nil or empty. Otherwise returns a non-empty String;
# if sanitization raises or leaves nothing, a deterministic
# "file__<sha1 prefix>" name derived from the original input is returned so
# the caller never sees nil for a non-empty id.
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil? || raw.empty?

  original = raw.dup
  begin
    # work on a binary copy to avoid premature encoding errors
    raw = raw.dup.force_encoding(Encoding::BINARY)

    # percent-decode (repeat until stable in case of double-encoding);
    # every pass shortens the string, so the loop terminates
    loop do
      decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [Regexp.last_match(1)].pack('H2') }
      break if decoded == raw

      raw = decoded
    end

    # try tidy_bytes; if it fails, reinterpret the bytes as UTF-8 and drop
    # only the invalid ones (transcoding from BINARY would discard every
    # non-ASCII byte, including valid UTF-8 sequences)
    raw = begin
      raw.tidy_bytes
    rescue StandardError
      raw.force_encoding(Encoding::UTF_8).scrub('')
    end

    # ensure UTF-8 and scrub again
    unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
      raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # strip HTML/comment artifacts & control chars
    raw = raw.gsub(/<!--+/, '').gsub(/[\x00-\x1F]/, '')

    # split query; hash it for a stable short name. partition always yields
    # strings, so an input that was cleaned down to '' cannot turn into nil.
    path_part, _sep, query_part = raw.partition('?')
    unless query_part.empty?
      q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
      if path_part.include?('.')
        pre, _dot, post = path_part.rpartition('.')
        path_part = "#{pre}__q#{q_digest}.#{post}"
      else
        path_part = "#{path_part}__q#{q_digest}"
      end
    end

    # collapse slashes & trim leading slash
    raw = path_part.gsub(%r{/+}, '/').sub(%r{\A/}, '')

    # segment-wise sanitation
    raw = raw.split('/').map do |segment|
      # '.' and '..' (possibly produced by percent-decoding) would let a
      # crafted id escape the backup directory
      next '_' if segment == '.' || segment == '..'

      seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
      # percent-encode characters that are invalid in Windows file names;
      # this also covers '<' and '>', so no separate bracket removal is needed
      seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
      seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
      seg.empty? ? '_' : seg
    end.join('/')

    # final fallback if empty
    raw = "file__#{Digest::SHA1.hexdigest(original)[0, 10]}" if raw.empty?

    raw
  rescue => e
    @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
    # deterministic fallback - never return nil so caller won't mark malformed
    "file__#{Digest::SHA1.hexdigest(original)[0, 10]}"
  end
end
782847

848+
# Formats a URL for display. A URL containing '&' (which breaks unquoted
# Windows CMD usage) is wrapped in parentheses; any other value, including
# nil, is returned untouched. Display only: the user still has to quote the
# URL when invoking the command manually.
def safe_display_url(url)
  needs_wrapping = url && url.match?(/[&]/)
  needs_wrapping ? "(#{url})" : url
end
855+
783856
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
784857
retries = 0
785858
begin

0 commit comments

Comments
 (0)