diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-04 22:10:44 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-04 22:10:46 -0800 |
commit | 583f11aa95b3af5897d29f143f99716a257e9357 (patch) | |
tree | be2ac0c6751f0bbde1eaac91bd496b4953fdca10 /python | |
parent | 33987348cae25ab57e546c2099ae8f3fd1caffd6 (diff) | |
download | sandcrawler-583f11aa95b3af5897d29f143f99716a257e9357.tar.gz sandcrawler-583f11aa95b3af5897d29f143f99716a257e9357.zip |
ia: use newer gwb (petabox) loading class
This fixes zstandard WARC reading.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 13 |
1 file changed, 8 insertions, 5 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a3d8249..0b58f3b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -21,7 +21,7 @@ http.client._MAXHEADERS = 1000 # type: ignore
 import wayback.exception
 from http.client import IncompleteRead
 from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
+from gwb.loader import CDXLoaderFactory3
 
 from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
 
@@ -360,9 +360,9 @@ class WaybackClient:
             raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
         warc_uri = self.warc_uri_prefix + warc_path
         if not self.rstore:
-            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory3(
                 webdata_secret=self.petabox_webdata_secret,
-                download_base_url=self.petabox_base_url))
+            ))
         try:
             #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
             gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
@@ -406,8 +406,11 @@ class WaybackClient:
             # convert revisit_dt
             # len("2018-07-24T11:56:49"), or with "Z"
             assert len(revisit_dt) in (19, 20)
-            revisit_uri = revisit_uri.decode('utf-8')
-            revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+            if type(revisit_uri) is bytes:
+                revisit_uri = revisit_uri.decode('utf-8')
+            if type(revisit_dt) is bytes:
+                revisit_dt = revisit_dt.decode('utf-8')
+            revisit_dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
             assert len(revisit_dt) == 14
             try:
                 revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)