|
15 | 15 | require_relative 'wayback_machine_downloader/to_regex'
|
16 | 16 | require_relative 'wayback_machine_downloader/archive_api'
|
17 | 17 | require_relative 'wayback_machine_downloader/subdom_processor'
|
| 18 | +require_relative 'wayback_machine_downloader/url_rewrite' |
18 | 19 |
|
19 | 20 | class ConnectionPool
|
20 | 21 | MAX_AGE = 300
|
@@ -474,6 +475,39 @@ def append_to_db(file_id)
|
474 | 475 | end
|
475 | 476 | end
|
476 | 477 |
|
| 478 | + def processing_files(pool, files_to_process) |
| 479 | + files_to_process.each do |file_remote_info| |
| 480 | + pool.post do |
| 481 | + download_success = false |
| 482 | + begin |
| 483 | + @connection_pool.with_connection do |connection| |
| 484 | + result_message = download_file(file_remote_info, connection) |
| 485 | + # assume download success if the result message contains ' -> ' |
| 486 | + if result_message && result_message.include?(' -> ') |
| 487 | + download_success = true |
| 488 | + end |
| 489 | + @download_mutex.synchronize do |
| 490 | + @processed_file_count += 1 |
| 491 | + # adjust progress message to reflect remaining files |
| 492 | + progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message |
| 493 | + puts progress_message if progress_message |
| 494 | + end |
| 495 | + end |
| 496 | +          # append to DB only after successful download outside the connection block |
| 497 | + if download_success |
| 498 | + append_to_db(file_remote_info[:file_id]) |
| 499 | + end |
| 500 | + rescue => e |
| 501 | + @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}") |
| 502 | + @download_mutex.synchronize do |
| 503 | + @processed_file_count += 1 |
| 504 | + end |
| 505 | + end |
| 506 | + sleep(RATE_LIMIT) |
| 507 | + end |
| 508 | + end |
| 509 | + end |
| 510 | + |
477 | 511 | def download_files
|
478 | 512 | start_time = Time.now
|
479 | 513 | puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
|
@@ -520,36 +554,7 @@ def download_files
|
520 | 554 | thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
521 | 555 | pool = Concurrent::FixedThreadPool.new(thread_count)
|
522 | 556 |
|
523 |
| - files_to_process.each do |file_remote_info| |
524 |
| - pool.post do |
525 |
| - download_success = false |
526 |
| - begin |
527 |
| - @connection_pool.with_connection do |connection| |
528 |
| - result_message = download_file(file_remote_info, connection) |
529 |
| - # assume download success if the result message contains ' -> ' |
530 |
| - if result_message && result_message.include?(' -> ') |
531 |
| - download_success = true |
532 |
| - end |
533 |
| - @download_mutex.synchronize do |
534 |
| - @processed_file_count += 1 |
535 |
| - # adjust progress message to reflect remaining files |
536 |
| - progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message |
537 |
| - puts progress_message if progress_message |
538 |
| - end |
539 |
| - end |
540 |
| - # sppend to DB only after successful download outside the connection block |
541 |
| - if download_success |
542 |
| - append_to_db(file_remote_info[:file_id]) |
543 |
| - end |
544 |
| - rescue => e |
545 |
| - @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}") |
546 |
| - @download_mutex.synchronize do |
547 |
| - @processed_file_count += 1 |
548 |
| - end |
549 |
| - end |
550 |
| - sleep(RATE_LIMIT) |
551 |
| - end |
552 |
| - end |
| 557 | + processing_files(pool, files_to_process) |
553 | 558 |
|
554 | 559 | pool.shutdown
|
555 | 560 | pool.wait_for_termination
|
@@ -609,64 +614,13 @@ def rewrite_urls_to_relative(file_path)
|
609 | 614 | end
|
610 | 615 |
|
611 | 616 | # URLs in HTML attributes
|
612 |
| - content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do |
613 |
| - prefix, url, suffix = $1, $2, $3 |
614 |
| - |
615 |
| - if url.start_with?('http') |
616 |
| - begin |
617 |
| - uri = URI.parse(url) |
618 |
| - path = uri.path |
619 |
| - path = path[1..-1] if path.start_with?('/') |
620 |
| - "#{prefix}#{path}#{suffix}" |
621 |
| - rescue |
622 |
| - "#{prefix}#{url}#{suffix}" |
623 |
| - end |
624 |
| - elsif url.start_with?('/') |
625 |
| - "#{prefix}./#{url[1..-1]}#{suffix}" |
626 |
| - else |
627 |
| - "#{prefix}#{url}#{suffix}" |
628 |
| - end |
629 |
| - end |
| 617 | + rewrite_html_attr_urls(content) |
630 | 618 |
|
631 | 619 | # URLs in CSS
|
632 |
| - content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do |
633 |
| - url = $1 |
634 |
| - |
635 |
| - if url.start_with?('http') |
636 |
| - begin |
637 |
| - uri = URI.parse(url) |
638 |
| - path = uri.path |
639 |
| - path = path[1..-1] if path.start_with?('/') |
640 |
| - "url(\"#{path}\")" |
641 |
| - rescue |
642 |
| - "url(\"#{url}\")" |
643 |
| - end |
644 |
| - elsif url.start_with?('/') |
645 |
| - "url(\"./#{url[1..-1]}\")" |
646 |
| - else |
647 |
| - "url(\"#{url}\")" |
648 |
| - end |
649 |
| - end |
| 620 | + rewrite_css_urls(content) |
650 | 621 |
|
651 | 622 | # URLs in JavaScript
|
652 |
| - content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do |
653 |
| - quote_start, url, quote_end = $1, $2, $3 |
654 |
| - |
655 |
| - if url.start_with?('http') |
656 |
| - begin |
657 |
| - uri = URI.parse(url) |
658 |
| - path = uri.path |
659 |
| - path = path[1..-1] if path.start_with?('/') |
660 |
| - "#{quote_start}#{path}#{quote_end}" |
661 |
| - rescue |
662 |
| - "#{quote_start}#{url}#{quote_end}" |
663 |
| - end |
664 |
| - elsif url.start_with?('/') |
665 |
| - "#{quote_start}./#{url[1..-1]}#{quote_end}" |
666 |
| - else |
667 |
| - "#{quote_start}#{url}#{quote_end}" |
668 |
| - end |
669 |
| - end |
| 623 | + rewrite_js_urls(content) |
670 | 624 |
|
671 | 625 | # for URLs in HTML attributes that start with a single slash
|
672 | 626 | content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
|
0 commit comments