Skip to content

Commit c3fecb6

Browse files
committed
Merge branch 'develop' for 0.5.0 release
2 parents 7694bf0 + 0b8a8f0 commit c3fecb6

40 files changed

+709
-252
lines changed

CHANGES.rst

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,33 @@
1+
pywb 0.5.0 changelist
2+
~~~~~~~~~~~~~~~~~~~~~
3+
4+
* Catch live rewrite errors and display more friendly pywb error message.
5+
6+
* LiveRewriteHandler and WBHandler refactoring: LiveRewriteHandler now supports a root search page html template.
7+
8+
* Proxy mode option: 'unaltered_replay' to proxy archival data with no modifications (no banner, no server or client side rewriting).
9+
10+
* Fix client side rewriting (wombat.js) for proxy mode: only rewrite https -> http in absolute urls.
11+
12+
* Fixes to memento timemap/timegate to work with framed replay mode.
13+
14+
* Support for a fallback handler which will be called from a replay handler instead of a 404 response.
15+
16+
The handler, specified via the ``fallback`` option, can be the name of any other replay handler. Typically, it can be used with a live rewrite handler to fetch missing content from the live web instead of showing a 404.
17+
18+
* Live Rewrite can now be included as a 'collection type' in a pywb deployment by setting index path to ``$liveweb``.
19+
20+
* ``live-rewrite-server`` has an optional ``--proxy host:port`` param to specify loading live web data through an HTTP/S proxy, such as for use with a recording proxy.
21+
22+
* wombat: add document.cookie -> document.WB_wombat_cookie rewriting to check and rewrite Path= to archival url
23+
24+
* Better parent relative '../' path rewriting, resolved to correct absolute urls when rewritten. Additional testing for parent relative urls.
25+
26+
* New 'proxy_options' block, including 'use_default_coll' to allow defaulting to first collection w/o proxy auth.
27+
28+
* Improved support for proxy mode, allowing different collections to be selected via proxy auth
29+
30+
131
pywb 0.4.7 changelist
232
~~~~~~~~~~~~~~~~~~~~~
333

README.rst

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PyWb 0.4.7
1+
PyWb 0.5.0
22
==========
33

44
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
@@ -11,9 +11,25 @@ pywb is a python implementation of web archival replay tools, sometimes also kno
1111

1212
pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
1313

14-
*For an example of deployed service using pywb, please see the https://webrecorder.io project*
1514

16-
pywb Tools
15+
Usage Examples
16+
-----------------------------
17+
18+
This README contains a basic overview of using pywb. After reading this intro, consider also taking a look at these separate projects:
19+
20+
* `pywb-webrecorder <https://github.com/ikreymer/pywb-webrecorder>`_ demonstrates a way to use pywb and warcprox to record web content while browsing.
21+
22+
* `pywb-samples <https://github.com/ikreymer/pywb-samples>`_ provides additional archive samples with difficult-to-replay content.
23+
24+
25+
The following deployed applications use pywb:
26+
27+
* https://perma.cc embeds pywb as part of a larger `open source application <https://github.com/harvard-lil/perma>`_ to provide web archive replay for law libraries.
28+
29+
* https://webrecorder.io uses pywb and builds upon pywb-webrecorder to create a hosted web recording and replay system.
30+
31+
32+
pywb Tools Overview
1733
-----------------------------
1834

1935
In addition to the standard wayback machine (explained further below), pywb tool suite includes a
@@ -72,7 +88,7 @@ This process can be done by running the ``cdx-indexer`` script and only needs to
7288

7389
Given an archive of warcs at ``myarchive/warcs``
7490

75-
1. Create a dir for indexs, .eg. ``myarchive/cdx``
91+
1. Create a dir for indexes, e.g. ``myarchive/cdx``
7692

7793
2. Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each
7894
warc/arc file in ``myarchive/warcs``

config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,12 @@ static_routes:
9191
# Enable simple http proxy mode
9292
enable_http_proxy: true
9393

94+
# Additional proxy options (defaults)
95+
#proxy_options:
96+
# use_default_coll: true
97+
#
98+
# unaltered_replay: false
99+
94100
# enable cdx server api for querying cdx directly (experimental)
95101
enable_cdx_api: true
96102

pywb/apps/live_rewrite_server.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,36 @@
22

33
from pywb.webapp.live_rewrite_handler import create_live_rewriter_app
44

5+
from argparse import ArgumentParser
6+
7+
58
#=================================================================
6-
# init cdx server app
9+
# init rewrite server app
710
#=================================================================
811

9-
application = init_app(create_live_rewriter_app, load_yaml=False)
12+
def create_app():
13+
parser = ArgumentParser(description='Live Rewrite Server')
14+
15+
parser.add_argument('-x', '--proxy',
16+
action='store',
17+
help='Specify host:port to use as HTTP/S proxy')
18+
19+
result, unknown = parser.parse_known_args()
20+
21+
config = dict(proxyhostport=result.proxy, framed_replay=True)
22+
23+
app = init_app(create_live_rewriter_app, load_yaml=False,
24+
config=config)
25+
26+
return app
27+
28+
29+
application = create_app()
1030

1131

1232
def main(): # pragma: no cover
1333
start_wsgi_server(application, 'Live Rewriter App', default_port=8090)
1434

35+
1536
if __name__ == "__main__":
1637
main()

pywb/cdx/cdxserver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def _check_cdx_iter(self, cdx_iter, query):
5858
return self.load_cdx(**fuzzy_query_params)
5959

6060
msg = 'No Captures found for: ' + query.url
61-
raise NotFoundException(msg)
61+
raise NotFoundException(msg, url=query.url)
6262

6363
def _calc_search_keys(self, query):
6464
return calc_search_range(url=query.url,

pywb/framework/archivalrouter.py

Lines changed: 50 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,48 @@ def __init__(self, routes, **kwargs):
2929
self.error_view = kwargs.get('error_view')
3030

3131
def __call__(self, env):
32+
request_uri = env['REL_REQUEST_URI']
33+
3234
for route in self.routes:
33-
result = route(env, self.abs_path)
34-
if result:
35-
return result
35+
matcher, coll = route.is_handling(request_uri)
36+
if matcher:
37+
wbrequest = self.parse_request(route, env, matcher,
38+
coll, request_uri,
39+
use_abs_prefix=self.abs_path)
40+
41+
return route.handler(wbrequest)
3642

3743
# Default Home Page
38-
if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']:
44+
if request_uri in ['/', '/index.html', '/index.htm']:
3945
return self.render_home_page(env)
4046

41-
return self.fallback(env, self.routes) if self.fallback else None
47+
return self.fallback(env, self) if self.fallback else None
48+
49+
def parse_request(self, route, env, matcher, coll, request_uri,
50+
use_abs_prefix=False):
51+
matched_str = matcher.group(0)
52+
if matched_str:
53+
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
54+
# remove the '/' + rel_prefix part of uri
55+
wb_url_str = request_uri[len(matched_str) + 2:]
56+
else:
57+
rel_prefix = env['SCRIPT_NAME'] + '/'
58+
# the request_uri is the wb_url, since no coll
59+
wb_url_str = request_uri[1:]
60+
61+
wbrequest = route.request_class(env,
62+
request_uri=request_uri,
63+
wb_url_str=wb_url_str,
64+
rel_prefix=rel_prefix,
65+
coll=coll,
66+
use_abs_prefix=use_abs_prefix,
67+
wburl_class=route.handler.get_wburl_type(),
68+
urlrewriter_class=UrlRewriter)
69+
70+
# Allow for applying of additional filters
71+
route.apply_filters(wbrequest, matcher)
72+
73+
return wbrequest
4274

4375
def render_home_page(self, env):
4476
# render the homepage!
@@ -73,45 +105,15 @@ def __init__(self, regex, handler, coll_group=0, config={},
73105
self.coll_group = coll_group
74106
self._custom_init(config)
75107

76-
def __call__(self, env, use_abs_prefix):
77-
wbrequest = self.parse_request(env, use_abs_prefix)
78-
return self.handler(wbrequest) if wbrequest else None
79-
80-
def parse_request(self, env, use_abs_prefix, request_uri=None):
81-
if not request_uri:
82-
request_uri = env['REL_REQUEST_URI']
83-
108+
def is_handling(self, request_uri):
84109
matcher = self.regex.match(request_uri[1:])
85110
if not matcher:
86-
return None
87-
88-
matched_str = matcher.group(0)
89-
if matched_str:
90-
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
91-
# remove the '/' + rel_prefix part of uri
92-
wb_url_str = request_uri[len(matched_str) + 2:]
93-
else:
94-
rel_prefix = env['SCRIPT_NAME'] + '/'
95-
# the request_uri is the wb_url, since no coll
96-
wb_url_str = request_uri[1:]
111+
return None, None
97112

98113
coll = matcher.group(self.coll_group)
114+
return matcher, coll
99115

100-
wbrequest = self.request_class(env,
101-
request_uri=request_uri,
102-
wb_url_str=wb_url_str,
103-
rel_prefix=rel_prefix,
104-
coll=coll,
105-
use_abs_prefix=use_abs_prefix,
106-
wburl_class=self.handler.get_wburl_type(),
107-
urlrewriter_class=UrlRewriter)
108-
109-
# Allow for applying of additional filters
110-
self._apply_filters(wbrequest, matcher)
111-
112-
return wbrequest
113-
114-
def _apply_filters(self, wbrequest, matcher):
116+
def apply_filters(self, wbrequest, matcher):
115117
for filter in self.filters:
116118
last_grp = len(matcher.groups())
117119
filter_str = filter.format(matcher.group(last_grp))
@@ -136,9 +138,11 @@ def __init__(self, match_prefixs):
136138
else:
137139
self.match_prefixs = [match_prefixs]
138140

139-
def __call__(self, env, routes):
141+
def __call__(self, env, the_router):
140142
referrer = env.get('HTTP_REFERER')
141143

144+
routes = the_router.routes
145+
142146
# ensure there is a referrer
143147
if referrer is None:
144148
return None
@@ -166,17 +170,15 @@ def __call__(self, env, routes):
166170
ref_request = None
167171

168172
for route in routes:
169-
ref_request = route.parse_request(env, False, request_uri=path)
170-
if ref_request:
173+
matcher, coll = route.is_handling(path)
174+
if matcher:
175+
ref_request = the_router.parse_request(route, env,
176+
matcher, coll, path)
171177
ref_route = route
172178
break
173179

174-
# must have matched one of the routes
175-
if not ref_request:
176-
return None
177-
178-
# must have a rewriter
179-
if not ref_request.urlrewriter:
180+
# must have matched one of the routes with a urlrewriter
181+
if not ref_request or not ref_request.urlrewriter:
180182
return None
181183

182184
rewriter = ref_request.urlrewriter

pywb/framework/memento.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,13 @@
1111
#=================================================================
1212
class MementoReqMixin(object):
1313
def _parse_extra(self):
14-
self.is_timegate = False
15-
1614
if not self.wb_url:
1715
return
1816

1917
if self.wb_url.type != self.wb_url.LATEST_REPLAY:
2018
return
2119

22-
self.is_timegate = True
20+
self.options['is_timegate'] = True
2321

2422
accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME')
2523
if not accept_datetime:
@@ -48,7 +46,7 @@ def _init_derived(self, params):
4846
if not wbrequest or not wbrequest.wb_url:
4947
return
5048

51-
is_timegate = wbrequest.is_timegate
49+
is_timegate = wbrequest.options.get('is_timegate', False)
5250

5351
if is_timegate:
5452
self.status_headers.headers.append(('Vary', 'accept-datetime'))
@@ -59,7 +57,7 @@ def _init_derived(self, params):
5957
is_memento = False
6058

6159
# otherwise, if in proxy mode, then always a memento
62-
elif wbrequest.is_proxy:
60+
elif wbrequest.options['is_proxy']:
6361
is_memento = True
6462

6563
# otherwise only for replay
@@ -80,7 +78,7 @@ def _init_derived(self, params):
8078
link.append(self.make_link(req_url, 'original'))
8179

8280
# for now, include timemap only in non-proxy mode
83-
if not wbrequest.is_proxy and (is_memento or is_timegate):
81+
if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
8482
link.append(self.make_timemap_link(wbrequest))
8583

8684
if is_memento and not is_timegate:
@@ -117,6 +115,7 @@ def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
117115
memento = '<{0}>; rel="{1}"; datetime="{2}"' + end
118116

119117
string = WbUrl.to_wburl_str(url=cdx['original'],
118+
mod='mp_',
120119
timestamp=cdx['timestamp'],
121120
type=WbUrl.REPLAY)
122121

@@ -140,15 +139,16 @@ def make_timemap(wbrequest, cdx_lines):
140139
# timemap link
141140
timemap = ('<{0}>; rel="self"; ' +
142141
'type="application/link-format"; from="{1}",\n')
143-
yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)
142+
yield timemap.format(prefix + wbrequest.wb_url.to_str(),
143+
from_date)
144144

145145
# original link
146146
original = '<{0}>; rel="original",\n'
147147
yield original.format(url)
148148

149149
# timegate link
150150
timegate = '<{0}>; rel="timegate",\n'
151-
yield timegate.format(prefix + url)
151+
yield timegate.format(prefix + 'mp_/' + url)
152152

153153
# first memento link
154154
yield make_memento_link(first_cdx, prefix,

0 commit comments

Comments
 (0)