@@ -477,8 +477,8 @@ def download_files
477
477
begin
478
478
@connection_pool . with_connection do |connection |
479
479
result_message = download_file ( file_remote_info , connection )
480
- # for now, assume success if no exception and message doesn't indicate error/skip
481
- if result_message && ! result_message . downcase . include? ( 'error' ) && ! result_message . downcase . include? ( 'failed' ) && ! result_message . downcase . include? ( 'skipping' ) && ! result_message . downcase . include? ( 'already exists ')
480
+ # assume download success if the result message contains ' -> '
481
+ if result_message && result_message . include? ( ' -> ' )
482
482
download_success = true
483
483
end
484
484
@download_mutex . synchronize do
@@ -659,11 +659,21 @@ def download_file (file_remote_info, http)
659
659
660
660
begin
661
661
structure_dir_path dir_path
662
- download_with_retry ( file_path , file_url , file_timestamp , http )
663
- if @rewrite && File . extname ( file_path ) =~ /\. (html?|css|js)$/i
664
- rewrite_urls_to_relative ( file_path )
662
+ status = download_with_retry ( file_path , file_url , file_timestamp , http )
663
+
664
+ case status
665
+ when :saved
666
+ if @rewrite && File . extname ( file_path ) =~ /\. (html?|css|js)$/i
667
+ rewrite_urls_to_relative ( file_path )
668
+ end
669
+ "#{ file_url } -> #{ file_path } (#{ @processed_file_count + 1 } /#{ @total_to_download } )"
670
+ when :skipped_not_found
671
+ "Skipped (not found): #{ file_url } (#{ @processed_file_count + 1 } /#{ @total_to_download } )"
672
+ else
673
+ # ideally, this case should not be reached if download_with_retry behaves as expected.
674
+ @logger . warn ( "Unknown status from download_with_retry for #{ file_url } : #{ status } " )
675
+ "Unknown status for #{ file_url } : #{ status } (#{ @processed_file_count + 1 } /#{ @total_to_download } )"
665
676
end
666
- "#{ file_url } -> #{ file_path } (#{ @processed_file_count + 1 } /#{ @total_to_download } )"
667
677
rescue StandardError => e
668
678
msg = "Failed: #{ file_url } # #{ e } (#{ @processed_file_count + 1 } /#{ @total_to_download } )"
669
679
if File . exist? ( file_path ) and File . size ( file_path ) == 0
@@ -714,8 +724,7 @@ def download_with_retry(file_path, file_url, file_timestamp, connection, redirec
714
724
715
725
response = connection . request ( request )
716
726
717
- case response
718
- when Net ::HTTPSuccess
727
+ save_response_body = lambda do
719
728
File . open ( file_path , "wb" ) do |file |
720
729
body = response . body
721
730
if response [ 'content-encoding' ] == 'gzip' && body && !body . empty?
@@ -725,26 +734,48 @@ def download_with_retry(file_path, file_url, file_timestamp, connection, redirec
725
734
gz . close
726
735
file . write ( decompressed_body )
727
736
rescue Zlib ::GzipFile ::Error => e
728
- @logger . warn ( "Failure decompressing gzip file #{ file_url } : #{ e . message } " )
737
+ @logger . warn ( "Failure decompressing gzip file #{ file_url } : #{ e . message } . Writing raw body. " )
729
738
file . write ( body )
730
739
end
731
740
else
732
741
file . write ( body ) if body
733
742
end
734
743
end
735
- when Net ::HTTPRedirection
736
- raise "Too many redirects for #{ file_url } " if redirect_count >= 2
737
- location = response [ 'location' ]
738
- @logger . warn ( "Redirect found for #{ file_url } -> #{ location } " )
739
- return download_with_retry ( file_path , location , file_timestamp , connection , redirect_count + 1 )
740
- when Net ::HTTPTooManyRequests
741
- sleep ( RATE_LIMIT * 2 )
742
- raise "Rate limited, retrying..."
743
- when Net ::HTTPNotFound
744
- @logger . warn ( "File not found, skipping: #{ file_url } " )
745
- return
746
- else
747
- raise "HTTP Error: #{ response . code } #{ response . message } "
744
+ end
745
+
746
+ if @all
747
+ case response
748
+ when Net ::HTTPSuccess , Net ::HTTPRedirection , Net ::HTTPClientError , Net ::HTTPServerError
749
+ save_response_body . call
750
+ if response . is_a? ( Net ::HTTPRedirection )
751
+ @logger . info ( "Saved redirect page for #{ file_url } (status #{ response . code } )." )
752
+ elsif response . is_a? ( Net ::HTTPClientError ) || response . is_a? ( Net ::HTTPServerError )
753
+ @logger . info ( "Saved error page for #{ file_url } (status #{ response . code } )." )
754
+ end
755
+ return :saved
756
+ else
757
+ # for any other response type when --all is true, treat as an error to be retried or failed
758
+ raise "Unhandled HTTP response: #{ response . code } #{ response . message } "
759
+ end
760
+ else # not @all (our default behavior)
761
+ case response
762
+ when Net ::HTTPSuccess
763
+ save_response_body . call
764
+ return :saved
765
+ when Net ::HTTPRedirection
766
+ raise "Too many redirects for #{ file_url } " if redirect_count >= 2
767
+ location = response [ 'location' ]
768
+ @logger . warn ( "Redirect found for #{ file_url } -> #{ location } " )
769
+ return download_with_retry ( file_path , location , file_timestamp , connection , redirect_count + 1 )
770
+ when Net ::HTTPTooManyRequests
771
+ sleep ( RATE_LIMIT * 2 )
772
+ raise "Rate limited, retrying..."
773
+ when Net ::HTTPNotFound
774
+ @logger . warn ( "File not found, skipping: #{ file_url } " )
775
+ return :skipped_not_found
776
+ else
777
+ raise "HTTP Error: #{ response . code } #{ response . message } "
778
+ end
748
779
end
749
780
750
781
rescue StandardError => e
0 commit comments