@@ -128,7 +128,8 @@ class WaybackMachineDownloader
128
128
129
129
attr_accessor :base_url , :exact_url , :directory , :all_timestamps ,
130
130
:from_timestamp , :to_timestamp , :only_filter , :exclude_filter ,
131
- :all , :maximum_pages , :threads_count , :logger , :reset , :keep , :rewrite
131
+ :all , :maximum_pages , :threads_count , :logger , :reset , :keep , :rewrite ,
132
+ :snapshot_at
132
133
133
134
def initialize params
134
135
validate_params ( params )
@@ -158,6 +159,7 @@ def initialize params
158
159
@rewrite = params [ :rewrite ] || false
159
160
@recursive_subdomains = params [ :recursive_subdomains ] || false
160
161
@subdomain_depth = params [ :subdomain_depth ] || 1
162
+ @snapshot_at = params [ :snapshot_at ] ? params [ :snapshot_at ] . to_i : nil
161
163
162
164
# URL for rejecting invalid/unencoded wayback urls
163
165
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/ \/ (((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/ ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/ (((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/ ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/ ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\? ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/ |\? )*)?(\# ((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/ |\? )*)?)$/
@@ -330,6 +332,36 @@ def get_all_snapshots_to_consider
330
332
snapshot_list_to_consider
331
333
end
332
334
335
# Builds the composite file list for a point in time: for every distinct
# file id, the newest snapshot captured at or before +target_timestamp+.
#
# A snapshot is skipped when its URL contains no '/', has too few segments
# to carry a path (malformed), is matched by match_exclude_filter, or is
# not matched by match_only_filter.
#
# @param target_timestamp [Integer, String] inclusive upper bound; compared
#   numerically (via to_i) against each snapshot timestamp
# @return [Array<Hash>] one hash per file id with :file_url, :timestamp and
#   :file_id keys
def get_composite_snapshot_file_list(target_timestamp)
  # Coerce once so a String timestamp compares numerically instead of raising.
  cutoff = target_timestamp.to_i
  file_versions = {}
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
    next unless file_url.include?('/')
    snapshot_time = file_timestamp.to_i
    next if snapshot_time > cutoff
    # Everything from the fourth '/'-separated segment onwards forms the file id.
    path_segments = file_url.split('/')[3..-1]
    # The slice is nil when the URL has fewer than three segments
    # (e.g. no scheme); treat that as malformed instead of crashing on nil.
    next if path_segments.nil?
    file_id = CGI.unescape(path_segments.join('/'))
    file_id = file_id.tidy_bytes unless file_id == ""
    next if file_id.nil?
    next if match_exclude_filter(file_url)
    next unless match_only_filter(file_url)
    # Keep the most recent version <= cutoff for each file id.
    current = file_versions[file_id]
    if current.nil? || current[:timestamp].to_i < snapshot_time
      file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
    end
  end
  file_versions.values
end
354
+
355
# Returns the composite snapshot file list ordered newest-first.
#
# @param target_timestamp [Integer] passed through unchanged to
#   get_composite_snapshot_file_list
# @return [Array<Hash>] the helper's entries sorted by :timestamp (compared
#   as strings), descending
def get_file_list_composite_snapshot(target_timestamp)
  # The helper returns flat hashes that already carry :file_id, so they are
  # sorted directly; destructuring them as [file_id, info] pairs would bind
  # nil to the second block parameter and raise.
  get_composite_snapshot_file_list(target_timestamp)
    .sort_by { |file_info| file_info[:timestamp].to_s }
    .reverse
end
364
+
333
365
def get_file_list_curated
334
366
file_list_curated = Hash . new
335
367
get_all_snapshots_to_consider . each do |file_timestamp , file_url |
@@ -384,7 +416,9 @@ def get_file_list_all_timestamps
384
416
385
417
386
418
def get_file_list_by_timestamp
387
- if @all_timestamps
419
+ if @snapshot_at
420
+ @file_list_by_snapshot_at ||= get_composite_snapshot_file_list ( @snapshot_at )
421
+ elsif @all_timestamps
388
422
file_list_curated = get_file_list_all_timestamps
389
423
file_list_curated . map do |file_remote_info |
390
424
file_remote_info [ 1 ] [ :file_id ] = file_remote_info [ 0 ]
@@ -727,7 +761,22 @@ def file_queue
727
761
end
728
762
729
763
# Memoized accessor for the file list to download.
#
# Delegates to get_file_list_by_timestamp (which itself branches on
# @snapshot_at and @all_timestamps) instead of duplicating that branching
# here, and caches the result so repeated calls do not rebuild the list.
#
# @return the value produced by get_file_list_by_timestamp on first call
def file_list_by_timestamp
  @file_list_by_timestamp ||= get_file_list_by_timestamp
end
732
781
733
782
private
0 commit comments