aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-04 22:10:44 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-04 22:10:46 -0800
commit 583f11aa95b3af5897d29f143f99716a257e9357 (patch)
tree be2ac0c6751f0bbde1eaac91bd496b4953fdca10
parent 33987348cae25ab57e546c2099ae8f3fd1caffd6 (diff)
download sandcrawler-583f11aa95b3af5897d29f143f99716a257e9357.tar.gz
sandcrawler-583f11aa95b3af5897d29f143f99716a257e9357.zip
ia: use newer gwb (petabox) loading class
This fixes zstandard WARC reading.
-rw-r--r-- python/sandcrawler/ia.py 13
1 files changed, 8 insertions, 5 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a3d8249..0b58f3b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -21,7 +21,7 @@ http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
+from gwb.loader import CDXLoaderFactory3
from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
@@ -360,9 +360,9 @@ class WaybackClient:
raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory3(
webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ ))
try:
#print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
@@ -406,8 +406,11 @@ class WaybackClient:
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
assert len(revisit_dt) in (19, 20)
- revisit_uri = revisit_uri.decode('utf-8')
- revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+ if type(revisit_uri) is bytes:
+ revisit_uri = revisit_uri.decode('utf-8')
+ if type(revisit_dt) is bytes:
+ revisit_dt = revisit_dt.decode('utf-8')
+ revisit_dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
assert len(revisit_dt) == 14
try:
revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)