Skip to content

Commit 183ed61

Browse files
Attempt at fixing --all
I honestly don't recall whether this was implemented in the original code; my guess is that it worked at *some point* during this fork. Either way, it seems to work correctly now. See #6 and #11.
1 parent e6ecf32 commit 183ed61

File tree

1 file changed

+53
-22
lines changed

1 file changed

+53
-22
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 53 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -477,8 +477,8 @@ def download_files
477477
begin
478478
@connection_pool.with_connection do |connection|
479479
result_message = download_file(file_remote_info, connection)
480-
# for now, assume success if no exception and message doesn't indicate error/skip
481-
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
480+
# assume download success if the result message contains ' -> '
481+
if result_message && result_message.include?(' -> ')
482482
download_success = true
483483
end
484484
@download_mutex.synchronize do
@@ -659,11 +659,21 @@ def download_file (file_remote_info, http)
659659

660660
begin
661661
structure_dir_path dir_path
662-
download_with_retry(file_path, file_url, file_timestamp, http)
663-
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
664-
rewrite_urls_to_relative(file_path)
662+
status = download_with_retry(file_path, file_url, file_timestamp, http)
663+
664+
case status
665+
when :saved
666+
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
667+
rewrite_urls_to_relative(file_path)
668+
end
669+
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
670+
when :skipped_not_found
671+
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
672+
else
673+
# ideally, this case should not be reached if download_with_retry behaves as expected.
674+
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
675+
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
665676
end
666-
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
667677
rescue StandardError => e
668678
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
669679
if File.exist?(file_path) and File.size(file_path) == 0
@@ -714,8 +724,7 @@ def download_with_retry(file_path, file_url, file_timestamp, connection, redirec
714724

715725
response = connection.request(request)
716726

717-
case response
718-
when Net::HTTPSuccess
727+
save_response_body = lambda do
719728
File.open(file_path, "wb") do |file|
720729
body = response.body
721730
if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -725,26 +734,48 @@ def download_with_retry(file_path, file_url, file_timestamp, connection, redirec
725734
gz.close
726735
file.write(decompressed_body)
727736
rescue Zlib::GzipFile::Error => e
728-
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
737+
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
729738
file.write(body)
730739
end
731740
else
732741
file.write(body) if body
733742
end
734743
end
735-
when Net::HTTPRedirection
736-
raise "Too many redirects for #{file_url}" if redirect_count >= 2
737-
location = response['location']
738-
@logger.warn("Redirect found for #{file_url} -> #{location}")
739-
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
740-
when Net::HTTPTooManyRequests
741-
sleep(RATE_LIMIT * 2)
742-
raise "Rate limited, retrying..."
743-
when Net::HTTPNotFound
744-
@logger.warn("File not found, skipping: #{file_url}")
745-
return
746-
else
747-
raise "HTTP Error: #{response.code} #{response.message}"
744+
end
745+
746+
if @all
747+
case response
748+
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
749+
save_response_body.call
750+
if response.is_a?(Net::HTTPRedirection)
751+
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
752+
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
753+
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
754+
end
755+
return :saved
756+
else
757+
# for any other response type when --all is true, treat as an error to be retried or failed
758+
raise "Unhandled HTTP response: #{response.code} #{response.message}"
759+
end
760+
else # not @all (our default behavior)
761+
case response
762+
when Net::HTTPSuccess
763+
save_response_body.call
764+
return :saved
765+
when Net::HTTPRedirection
766+
raise "Too many redirects for #{file_url}" if redirect_count >= 2
767+
location = response['location']
768+
@logger.warn("Redirect found for #{file_url} -> #{location}")
769+
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
770+
when Net::HTTPTooManyRequests
771+
sleep(RATE_LIMIT * 2)
772+
raise "Rate limited, retrying..."
773+
when Net::HTTPNotFound
774+
@logger.warn("File not found, skipping: #{file_url}")
775+
return :skipped_not_found
776+
else
777+
raise "HTTP Error: #{response.code} #{response.message}"
778+
end
748779
end
749780

750781
rescue StandardError => e

0 commit comments

Comments
 (0)