Skip to content

Commit bff10e7

Browse files
Initial implementation of a composite snapshot
see issue #22. TBF
1 parent 3d181ce commit bff10e7

File tree

1 file changed

+52
-3
lines changed

1 file changed

+52
-3
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ class WaybackMachineDownloader
128128

129129
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
130130
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
131-
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
131+
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
132+
:snapshot_at
132133

133134
def initialize params
134135
validate_params(params)
@@ -158,6 +159,7 @@ def initialize params
158159
@rewrite = params[:rewrite] || false
159160
@recursive_subdomains = params[:recursive_subdomains] || false
160161
@subdomain_depth = params[:subdomain_depth] || 1
162+
@snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
161163

162164
# URL for rejecting invalid/unencoded wayback urls
163165
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -330,6 +332,36 @@ def get_all_snapshots_to_consider
330332
snapshot_list_to_consider
331333
end
332334

335+
# Builds the file list for a composite snapshot: for every file id, the most
# recent capture whose timestamp is not later than +target_timestamp+.
#
# Entries come from +get_all_snapshots_to_consider+, which is iterated as
# (timestamp, url) pairs. URLs rejected by +match_exclude_filter+ or not
# accepted by +match_only_filter+ are skipped.
#
# @param target_timestamp [Integer, String] inclusive upper bound; coerced with +to_i+
# @return [Array<Hash>] one hash per file id with +:file_url+, +:timestamp+ and +:file_id+
def get_composite_snapshot_file_list(target_timestamp)
  cutoff = target_timestamp.to_i
  file_versions = {}
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
    next unless file_url.include?('/')
    next if file_timestamp.to_i > cutoff

    # The file id is everything after the third '/'. The range slice returns
    # nil when the URL has fewer than three segments, so skip such URLs
    # instead of calling +join+ on nil.
    path_segments = file_url.split('/')[3..-1]
    next if path_segments.nil?

    file_id = CGI.unescape(path_segments.join('/'))
    file_id = file_id.tidy_bytes unless file_id == ""
    next if match_exclude_filter(file_url)
    next unless match_only_filter(file_url)

    # Keep only the most recent version <= cutoff for each file id.
    current = file_versions[file_id]
    if current.nil? || current[:timestamp].to_i < file_timestamp.to_i
      file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
    end
  end
  file_versions.values
end
354+
355+
# Returns the composite snapshot file list ordered from the newest to the
# oldest capture.
#
# +get_composite_snapshot_file_list+ returns an Array of Hashes (each already
# carrying +:file_id+), not [file_id, info] pairs, so the entries are sorted
# and returned directly instead of being destructured as pairs.
#
# @param target_timestamp [Integer, String] passed through to +get_composite_snapshot_file_list+
# @return [Array<Hash>] file hashes sorted by +:timestamp+ (compared as strings), descending
def get_file_list_composite_snapshot(target_timestamp)
  get_composite_snapshot_file_list(target_timestamp)
    .sort_by { |file_info| file_info[:timestamp].to_s }
    .reverse
end
364+
333365
def get_file_list_curated
334366
file_list_curated = Hash.new
335367
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
@@ -384,7 +416,9 @@ def get_file_list_all_timestamps
384416

385417

386418
def get_file_list_by_timestamp
387-
if @all_timestamps
419+
if @snapshot_at
420+
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
421+
elsif @all_timestamps
388422
file_list_curated = get_file_list_all_timestamps
389423
file_list_curated.map do |file_remote_info|
390424
file_remote_info[1][:file_id] = file_remote_info[0]
@@ -727,7 +761,22 @@ def file_queue
727761
end
728762

729763
# Memoized list of file hashes to download.
#
# * When +@snapshot_at+ is set, the composite snapshot list for that timestamp
#   is built once and cached in +@file_list_by_snapshot_at+.
# * Otherwise the list is built once and cached in +@file_list_by_timestamp+:
#   from +get_file_list_all_timestamps+ when +@all_timestamps+ is set, or from
#   +get_file_list_curated+ sorted by +:timestamp+ (as strings), newest first.
#
# Caching matters because every uncached call rebuilds the list from the
# underlying snapshot index.
#
# @return [Array<Hash>] file hashes, each including +:file_id+
def file_list_by_timestamp
  if @snapshot_at
    return @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
  end

  @file_list_by_timestamp ||= begin
    entries =
      if @all_timestamps
        get_file_list_all_timestamps
      else
        get_file_list_curated.sort_by { |_, info| info[:timestamp].to_s }.reverse
      end
    # Each entry is a (file_id, info) pair; fold the id into the info hash.
    entries.map do |file_id, info|
      info[:file_id] = file_id
      info
    end
  end
end
732781

733782
private

0 commit comments

Comments
 (0)