|
11 | 11 | require 'logger'
|
12 | 12 | require 'zlib'
|
13 | 13 | require 'stringio'
|
| 14 | +require 'digest' |
14 | 15 | require_relative 'wayback_machine_downloader/tidy_bytes'
|
15 | 16 | require_relative 'wayback_machine_downloader/to_regex'
|
16 | 17 | require_relative 'wayback_machine_downloader/archive_api'
|
@@ -171,12 +172,19 @@ def initialize params
|
171 | 172 |
|
# Derives the directory name used for this backup from @base_url.
#
# A trailing '/*' wildcard is dropped first. When the URL contains '//', the
# third '/'-separated component (the host, including any ':port') is used;
# otherwise the whole remaining string is used.
#
# On Windows only, characters that are not allowed in file names
# (: * ? " < > |) are replaced with '_' and trailing spaces/dots are removed,
# so that a 'host:port' name does not make mkdir fail with ENOTDIR. Other
# platforms keep the name untouched.
#
# @return [String] a non-empty name; 'site' when nothing usable is left
def backup_name
  url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url

  # split('/')[2] is nil for inputs such as 'http://', so coerce to a String
  # before any String method is called on the candidate name.
  candidate = url_to_process.include?('//') ? url_to_process.split('/')[2] : url_to_process
  name = candidate.to_s

  if Gem.win_platform?
    name = name.gsub(/[:*?"<>|]/, '_').sub(/[ .]+\z/, '')
  end

  name.empty? ? 'site' : name
end
|
181 | 189 |
|
182 | 190 | def backup_path
|
@@ -768,18 +776,83 @@ def setup_logger
|
768 | 776 |
|
# Turns a raw file id (or id+timestamp) into a relative path that is safe to
# use on disk.
#
# Steps, in order:
#   1. percent-decode until the string stops changing (double-encoded ids),
#   2. coerce to valid UTF-8 (tidy_bytes first, lossy transcoding as fallback),
#   3. drop '<!--' comment openers and ASCII control characters,
#   4. replace a non-empty query string with '__q' + the first 12 hex chars of
#      its SHA-256, placed before the extension of the LAST path segment,
#   5. collapse repeated slashes, drop the leading slash and sanitize every
#      segment: percent-encode : * ? " < > | \ and replace empty, '.' and
#      '..' segments with '_'.
#
# @param raw [String, nil] the unsanitized id
# @param file_url [String] source URL; only used in the warning message
# @return [String, nil] nil for nil/empty input, otherwise a non-empty path.
#   When sanitizing raises or leaves nothing, a deterministic
#   "file__<first 10 hex chars of SHA-1 of the input>" name is returned.
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil? || raw.empty?

  original = raw.dup
  begin
    # work on a binary copy to avoid premature encoding errors
    id = raw.dup.force_encoding(Encoding::BINARY)

    # percent-decode (repeat until stable in case of double-encoding);
    # every pass that changes anything shortens the string, so this terminates
    loop do
      decoded = id.gsub(/%([0-9A-Fa-f]{2})/) { [Regexp.last_match(1)].pack('H2') }
      break if decoded == id

      id = decoded
    end

    # prefer tidy_bytes; if it fails, transcode and drop what cannot be mapped
    id = begin
      id.tidy_bytes
    rescue StandardError
      id.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # ensure the result really is valid UTF-8
    unless id.encoding == Encoding::UTF_8 && id.valid_encoding?
      id = id.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # strip HTML comment openers and control characters
    id = id.gsub(/<!--+/, '').gsub(/[\x00-\x1F]/, '')

    # partition never yields nil, so an id that is empty by now simply falls
    # through to the final fallback instead of raising
    path_part, _sep, query_part = id.partition('?')
    unless query_part.empty?
      q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
      # only the last segment can carry a file extension; a dot inside a
      # directory name (e.g. 'v1.2/api') must not be treated as one
      dir, slash, base = path_part.rpartition('/')
      stem, dot, ext = base.rpartition('.')
      base = dot.empty? ? "#{base}__q#{q_digest}" : "#{stem}__q#{q_digest}.#{ext}"
      path_part = "#{dir}#{slash}#{base}"
    end

    # collapse slashes & trim leading slash
    id = path_part.gsub(%r{/+}, '/').sub(%r{\A/}, '')

    # segment-wise sanitation
    id = id.split('/').map do |segment|
      seg = segment.gsub(/[:*?"<>|\\]/) { |c| format('%%%02X', c.ord) }
      seg = seg.sub(/[ .]+\z/, '') if Gem.win_platform?
      # '.' and '..' would let a decoded id point outside its base directory
      # NOTE(review): assumes the caller joins this id onto a directory path
      seg.empty? || seg == '.' || seg == '..' ? '_' : seg
    end.join('/')

    # final fallback if nothing is left
    id = "file__#{Digest::SHA1.hexdigest(original)[0, 10]}" if id.empty?

    id
  rescue StandardError => e
    @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
    # deterministic fallback - never return nil for a non-empty input
    "file__#{Digest::SHA1.hexdigest(original)[0, 10]}"
  end
end
|
782 | 847 |
|
# Formats a URL for display: a URL containing '&' is returned wrapped in
# parentheses, because '&' breaks unquoted use on the Windows CMD prompt.
# Anything else (including nil) is returned unchanged. Display only - the
# user still has to quote the URL when invoking the tool manually.
def safe_display_url(url)
  needs_wrapping = url && url.match?(/[&]/)
  needs_wrapping ? "(#{url})" : url
end
| 855 | + |
783 | 856 | def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
784 | 857 | retries = 0
|
785 | 858 | begin
|
|
0 commit comments