about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-14 17:01:26 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-14 17:01:26 -0800
commit: 818a936be9480bb75e40d7e3723aed3ac8c1eee9 (patch)
tree: f0a0f3b289b724ed529f71f9260920d4fad987bc /python
parent: 91e6b33a4733fbe622ce0e09460a75cd377bee7a (diff)
download: sandcrawler-818a936be9480bb75e40d7e3723aed3ac8c1eee9.tar.gz
sandcrawler-818a936be9480bb75e40d7e3723aed3ac8c1eee9.zip
grobid worker fixes for newer ia lib refactors
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/grobid.py 12
1 files changed, 9 insertions, 3 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 31af974..b989591 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -85,8 +85,11 @@ class GrobidWorker(SandcrawlerWorker):
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
try:
- blob = self.wayback_client.fetch_petabox_body(record['warc_csize'],
- record['warc_offset'], record['warc_path'])
+ blob = self.wayback_client.fetch_petabox_body(
+ csize=record['warc_csize'],
+ offset=record['warc_offset'],
+ warc_path=record['warc_path'],
+ )
except WaybackError as we:
return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('url') and record.get('datetime'):
@@ -94,7 +97,10 @@ class GrobidWorker(SandcrawlerWorker):
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
try:
- blob = self.wayback_client.fetch_warc_by_url_dt(record['url'], record['datetime'])
+ blob = self.wayback_client.fetch_replay_body(
+ url=record['url'],
+ datetime=record['datetime'],
+ )
except WaybackError as we:
return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('item') and record.get('path'):