diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:01:26 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:01:26 -0800 |
commit | 818a936be9480bb75e40d7e3723aed3ac8c1eee9 (patch) | |
tree | f0a0f3b289b724ed529f71f9260920d4fad987bc /python | |
parent | 91e6b33a4733fbe622ce0e09460a75cd377bee7a (diff) | |
download | sandcrawler-818a936be9480bb75e40d7e3723aed3ac8c1eee9.tar.gz sandcrawler-818a936be9480bb75e40d7e3723aed3ac8c1eee9.zip |
grobid worker fixes for newer ia lib refactors
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/grobid.py | 12 |
1 files changed, 9 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 31af974..b989591 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -85,8 +85,11 @@ class GrobidWorker(SandcrawlerWorker):
             if not self.wayback_client:
                 raise Exception("wayback client not configured for this GrobidWorker")
             try:
-                blob = self.wayback_client.fetch_petabox_body(record['warc_csize'],
-                    record['warc_offset'], record['warc_path'])
+                blob = self.wayback_client.fetch_petabox_body(
+                    csize=record['warc_csize'],
+                    offset=record['warc_offset'],
+                    warc_path=record['warc_path'],
+                )
             except WaybackError as we:
                 return dict(status="error-wayback", error_msg=str(we), source=record)
         elif record.get('url') and record.get('datetime'):
@@ -94,7 +97,10 @@ class GrobidWorker(SandcrawlerWorker):
             if not self.wayback_client:
                 raise Exception("wayback client not configured for this GrobidWorker")
             try:
-                blob = self.wayback_client.fetch_warc_by_url_dt(record['url'], record['datetime'])
+                blob = self.wayback_client.fetch_replay_body(
+                    url=record['url'],
+                    datetime=record['datetime'],
+                )
             except WaybackError as we:
                 return dict(status="error-wayback", error_msg=str(we), source=record)
         elif record.get('item') and record.get('path'):
```