diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-02-21 12:25:30 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-02-21 12:25:30 -0800 |
commit | 0fa705d4e25e329bd976c399acb58cb95d0d424f (patch) | |
tree | 5c7ff0da18084a65ecf04a15d5e5589113b9e3ad /python/extraction_ungrobided.py | |
parent | 4225fe89836b72e771e612139d0f5561088a6909 (diff) | |
download | sandcrawler-0fa705d4e25e329bd976c399acb58cb95d0d424f.tar.gz sandcrawler-0fa705d4e25e329bd976c399acb58cb95d0d424f.zip |
python test fixes
Diffstat (limited to 'python/extraction_ungrobided.py')
-rwxr-xr-x | python/extraction_ungrobided.py | 5 |
1 files changed, 3 insertions, 2 deletions
```diff
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 99d4f13..225e46f 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -17,6 +17,7 @@ Requires:
 # in `wayback` library. Means we can't run pylint.
 # pylint: skip-file
 
+import os
 import xml
 import json
 import raven
@@ -122,11 +123,11 @@ class MRExtractUnGrobided(MRJob):
         return info, None
 
     def fetch_warc_content(self, warc_path, offset, c_size):
-        warc_uri = self.warc_uri_prefix + warc_path
+        warc_uri = self.options.warc_uri_prefix + warc_path
         if not self.rstore:
             self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
                 webdata_secret=self.petabox_webdata_secret,
-                download_base_url=self.petabox_base_url))
+                download_base_url=self.options.warc_uri_prefix))
         try:
             gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
         except wayback.exception.ResourceUnavailable:
```