webrecorder
diff --git a/‎CHANGES.rst
Lines changed: 30 additions & 0 deletions b/‎CHANGES.rst
Lines changed: 30 additions & 0 deletions
diff --git a/‎README.rst
Lines changed: 20 additions & 4 deletions b/‎README.rst
Lines changed: 20 additions & 4 deletions
diff --git a/‎config.yaml
Lines changed: 6 additions & 0 deletions b/‎config.yaml
Lines changed: 6 additions & 0 deletions
diff --git a/‎pywb/apps/live_rewrite_server.py
Lines changed: 23 additions & 2 deletions b/‎pywb/apps/live_rewrite_server.py
Lines changed: 23 additions & 2 deletions
diff --git a/‎pywb/cdx/cdxserver.py
Lines changed: 1 addition & 1 deletion b/‎pywb/cdx/cdxserver.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎pywb/framework/archivalrouter.py
Lines changed: 50 additions & 48 deletions b/‎pywb/framework/archivalrouter.py
Lines changed: 50 additions & 48 deletions
diff --git a/‎pywb/framework/memento.py
Lines changed: 8 additions & 8 deletions b/‎pywb/framework/memento.py
Lines changed: 8 additions & 8 deletions
@@ -1,3 +1,33 @@
+pywb 0.5.0 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* Catch live rewrite errors and display more friendly pywb error message.
+
+* LiveRewriteHandler and WBHandler refactoring: LiveRewriteHandler now supports a root search page html template.
+
+* Proxy mode option: 'unaltered_replay' to proxy archival data with no modifications (no banner, no server or client side rewriting).
+
+* Fix client side rewriting (wombat.js) for proxy mode: only rewrite https -> http in absolute urls.
+
+* Fixes to memento timemap/timegate to work with framed replay mode.
+
+* Support for a fallback handler which will be called from a replay handler instead of a 404 response.
+
+  The handler, specified via the ``fallback`` option, can be the name of any other replay handler. Typically, it can be used with a live rewrite handler to fetch missing content from live instead of showing a 404.
+
+* Live Rewrite can now be included as a 'collection type' in a pywb deployment by setting index path to ``$liveweb``.
+
+* ``live-rewrite-server`` has an optional ``--proxy host:port`` param to specify loading live web data through an HTTP/S proxy, such as for use with a recording proxy.
+
+* wombat: add document.cookie -> document.WB_wombat_cookie rewriting to check and rewrite Path= to archival url
+
+* Better parent relative '../' path rewriting, resolved to correct absolute urls when rewritten. Additional testing for parent relative urls.
+
+* New 'proxy_options' block, including 'use_default_coll' to allow defaulting to first collection w/o proxy auth.
+
+* Improved support for proxy mode: allow different collections to be selected via proxy auth.
+
+
 pywb 0.4.7 changelist
 ~~~~~~~~~~~~~~~~~~~~~
 
 
@@ -1,4 +1,4 @@
-PyWb 0.4.7
+PyWb 0.5.0
 ==========
 
 .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
@@ -11,9 +11,25 @@ pywb is a python implementation of web archival replay tools, sometimes also kno
 
 pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
 
-*For an example of deployed service using pywb, please see the https://webrecorder.io project*
 
-pywb Tools
+Usage Examples
+-----------------------------
+
+This README contains a basic overview of using pywb. After reading this intro, consider also taking a look at these separate projects:
+
+* `pywb-webrecorder <https://github.com/ikreymer/pywb-webrecorder>`_ demonstrates a way to use pywb and warcprox to record web content while browsing.
+
+* `pywb-samples <https://github.com/ikreymer/pywb-samples>`_ provides additional archive samples with difficult-to-replay content.
+
+
+The following deployed applications use pywb:
+
+* https://perma.cc embeds pywb as part of a larger `open source application <https://github.com/harvard-lil/perma>`_ to provide web archive replay for law libraries.
+
+* https://webrecorder.io uses pywb and builds upon pywb-webrecorder to create a hosted web recording and replay system.
+
+
+pywb Tools Overview
 -----------------------------
 
 In addition to the standard wayback machine (explained further below), pywb tool suite includes a 
@@ -72,7 +88,7 @@ This process can be done by running the ``cdx-indexer`` script and only needs to
 
 Given an archive of warcs at ``myarchive/warcs``
 
-1. Create a dir for indexs, .eg. ``myarchive/cdx``
+1. Create a dir for indexes, e.g. ``myarchive/cdx``
 
 2. Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each
    warc/arc file in ``myarchive/warcs``
 
@@ -91,6 +91,12 @@ static_routes:
 # Enable simple http proxy mode
 enable_http_proxy: true
 
+# Additional proxy options (defaults)
+#proxy_options:
+#    use_default_coll: true
+#
+#    unaltered_replay: false
+
 # enable cdx server api for querying cdx directly (experimental)
 enable_cdx_api: true
 
 
@@ -2,15 +2,36 @@
 
 from pywb.webapp.live_rewrite_handler import create_live_rewriter_app
 
+from argparse import ArgumentParser
+
+
 #=================================================================
-# init cdx server app
+# init rewrite server app
 #=================================================================
 
-application = init_app(create_live_rewriter_app, load_yaml=False)
+def create_app():
+    parser = ArgumentParser(description='Live Rewrite Server')
+
+    parser.add_argument('-x', '--proxy',
+                        action='store',
+                        help='Specify host:port to use as HTTP/S proxy')
+
+    result, unknown = parser.parse_known_args()
+
+    config = dict(proxyhostport=result.proxy, framed_replay=True)
+
+    app = init_app(create_live_rewriter_app, load_yaml=False,
+                   config=config)
+
+    return app
+
+
+application = create_app()
 
 
 def main():  # pragma: no cover
     start_wsgi_server(application, 'Live Rewriter App', default_port=8090)
 
+
 if __name__ == "__main__":
     main()
@@ -58,7 +58,7 @@ def _check_cdx_iter(self, cdx_iter, query):
                 return self.load_cdx(**fuzzy_query_params)
 
         msg = 'No Captures found for: ' + query.url
-        raise NotFoundException(msg)
+        raise NotFoundException(msg, url=query.url)
 
     def _calc_search_keys(self, query):
         return calc_search_range(url=query.url,
 
@@ -29,16 +29,48 @@ def __init__(self, routes, **kwargs):
         self.error_view = kwargs.get('error_view')
 
     def __call__(self, env):
+        request_uri = env['REL_REQUEST_URI']
+
         for route in self.routes:
-            result = route(env, self.abs_path)
-            if result:
-                return result
+            matcher, coll = route.is_handling(request_uri)
+            if matcher:
+                wbrequest = self.parse_request(route, env, matcher,
+                                               coll, request_uri,
+                                               use_abs_prefix=self.abs_path)
+
+                return route.handler(wbrequest)
 
         # Default Home Page
-        if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']:
+        if request_uri in ['/', '/index.html', '/index.htm']:
             return self.render_home_page(env)
 
-        return self.fallback(env, self.routes) if self.fallback else None
+        return self.fallback(env, self) if self.fallback else None
+
+    def parse_request(self, route, env, matcher, coll, request_uri,
+                      use_abs_prefix=False):
+        matched_str = matcher.group(0)
+        if matched_str:
+            rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
+            # remove the '/' + rel_prefix part of uri
+            wb_url_str = request_uri[len(matched_str) + 2:]
+        else:
+            rel_prefix = env['SCRIPT_NAME'] + '/'
+            # the request_uri is the wb_url, since no coll
+            wb_url_str = request_uri[1:]
+
+        wbrequest = route.request_class(env,
+                              request_uri=request_uri,
+                              wb_url_str=wb_url_str,
+                              rel_prefix=rel_prefix,
+                              coll=coll,
+                              use_abs_prefix=use_abs_prefix,
+                              wburl_class=route.handler.get_wburl_type(),
+                              urlrewriter_class=UrlRewriter)
+
+        # Allow for applying of additional filters
+        route.apply_filters(wbrequest, matcher)
+
+        return wbrequest
 
     def render_home_page(self, env):
         # render the homepage!
@@ -73,45 +105,15 @@ def __init__(self, regex, handler, coll_group=0, config={},
         self.coll_group = coll_group
         self._custom_init(config)
 
-    def __call__(self, env, use_abs_prefix):
-        wbrequest = self.parse_request(env, use_abs_prefix)
-        return self.handler(wbrequest) if wbrequest else None
-
-    def parse_request(self, env, use_abs_prefix, request_uri=None):
-        if not request_uri:
-            request_uri = env['REL_REQUEST_URI']
-
+    def is_handling(self, request_uri):
         matcher = self.regex.match(request_uri[1:])
         if not matcher:
-            return None
-
-        matched_str = matcher.group(0)
-        if matched_str:
-            rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
-            # remove the '/' + rel_prefix part of uri
-            wb_url_str = request_uri[len(matched_str) + 2:]
-        else:
-            rel_prefix = env['SCRIPT_NAME'] + '/'
-            # the request_uri is the wb_url, since no coll
-            wb_url_str = request_uri[1:]
+            return None, None
 
         coll = matcher.group(self.coll_group)
+        return matcher, coll
 
-        wbrequest = self.request_class(env,
-                              request_uri=request_uri,
-                              wb_url_str=wb_url_str,
-                              rel_prefix=rel_prefix,
-                              coll=coll,
-                              use_abs_prefix=use_abs_prefix,
-                              wburl_class=self.handler.get_wburl_type(),
-                              urlrewriter_class=UrlRewriter)
-
-        # Allow for applying of additional filters
-        self._apply_filters(wbrequest, matcher)
-
-        return wbrequest
-
-    def _apply_filters(self, wbrequest, matcher):
+    def apply_filters(self, wbrequest, matcher):
         for filter in self.filters:
             last_grp = len(matcher.groups())
             filter_str = filter.format(matcher.group(last_grp))
@@ -136,9 +138,11 @@ def __init__(self, match_prefixs):
         else:
             self.match_prefixs = [match_prefixs]
 
-    def __call__(self, env, routes):
+    def __call__(self, env, the_router):
         referrer = env.get('HTTP_REFERER')
 
+        routes = the_router.routes
+
         # ensure there is a referrer
         if referrer is None:
             return None
@@ -166,17 +170,15 @@ def __call__(self, env, routes):
         ref_request = None
 
         for route in routes:
-            ref_request = route.parse_request(env, False, request_uri=path)
-            if ref_request:
+            matcher, coll = route.is_handling(path)
+            if matcher:
+                ref_request = the_router.parse_request(route, env,
+                                                       matcher, coll, path)
                 ref_route = route
                 break
 
-        # must have matched one of the routes
-        if not ref_request:
-            return None
-
-        # must have a rewriter
-        if not ref_request.urlrewriter:
+        # must have matched one of the routes with a urlrewriter
+        if not ref_request or not ref_request.urlrewriter:
             return None
 
         rewriter = ref_request.urlrewriter
 
@@ -11,15 +11,13 @@
 #=================================================================
 class MementoReqMixin(object):
     def _parse_extra(self):
-        self.is_timegate = False
-
         if not self.wb_url:
             return
 
         if self.wb_url.type != self.wb_url.LATEST_REPLAY:
             return
 
-        self.is_timegate = True
+        self.options['is_timegate'] = True
 
         accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME')
         if not accept_datetime:
@@ -48,7 +46,7 @@ def _init_derived(self, params):
         if not wbrequest or not wbrequest.wb_url:
             return
 
-        is_timegate = wbrequest.is_timegate
+        is_timegate = wbrequest.options.get('is_timegate', False)
 
         if is_timegate:
             self.status_headers.headers.append(('Vary', 'accept-datetime'))
@@ -59,7 +57,7 @@ def _init_derived(self, params):
             is_memento = False
 
         # otherwise, if in proxy mode, then always a memento
-        elif wbrequest.is_proxy:
+        elif wbrequest.options['is_proxy']:
             is_memento = True
 
         # otherwise only for replay
@@ -80,7 +78,7 @@ def _init_derived(self, params):
             link.append(self.make_link(req_url, 'original'))
 
         # for now, include timemap only in non-proxy mode
-        if not wbrequest.is_proxy and (is_memento or is_timegate):
+        if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
             link.append(self.make_timemap_link(wbrequest))
 
         if is_memento and not is_timegate:
@@ -117,6 +115,7 @@ def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
     memento = '<{0}>; rel="{1}"; datetime="{2}"' + end
 
     string = WbUrl.to_wburl_str(url=cdx['original'],
+                                mod='mp_',
                                 timestamp=cdx['timestamp'],
                                 type=WbUrl.REPLAY)
 
@@ -140,15 +139,16 @@ def make_timemap(wbrequest, cdx_lines):
     # timemap link
     timemap = ('<{0}>; rel="self"; ' +
                'type="application/link-format"; from="{1}",\n')
-    yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)
+    yield timemap.format(prefix + wbrequest.wb_url.to_str(),
+                         from_date)
 
     # original link
     original = '<{0}>; rel="original",\n'
     yield original.format(url)
 
     # timegate link
     timegate = '<{0}>; rel="timegate",\n'
-    yield timegate.format(prefix + url)
+    yield timegate.format(prefix + 'mp_/' + url)
 
     # first memento link
     yield make_memento_link(first_cdx, prefix,