Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r--  python/sandcrawler/grobid.py | 66
1 file changed, 63 insertions(+), 3 deletions(-)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 0e37c0e..a610404 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,10 +1,15 @@
import requests
+from collections import Counter
+
+from .workers import SandcrawlerWorker
+from .misc import gen_file_metadata
+from .ia import WaybackClient, WaybackError


class GrobidClient(object):
- def __init__(self, host_uri, **kwargs):
- self.host_uri = host_uri
+ def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
+ self.host_url = host_url
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))

    def process_fulltext(self, blob, consolidate_mode=None):
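As a quick orientation, here is a minimal sketch of constructing the renamed client. The localhost URL and the consolidate_mode override are illustrative assumptions, not part of this change:

    from sandcrawler.grobid import GrobidClient

    # uses the new default host_url (grobid.qa.fatcat.wiki)
    client = GrobidClient()

    # or point at any other GROBID instance and override consolidation
    client = GrobidClient(host_url="http://localhost:8070", consolidate_mode=2)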
@@ -23,7 +28,7 @@ class GrobidClient(object):
consolidate_mode = self.consolidate_mode
grobid_response = requests.post(
- self.host_uri + "/api/processFulltextDocument",
+ self.host_url + "/api/processFulltextDocument",
files={
'input': blob,
            'consolidate_mode': consolidate_mode,
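And a hedged sketch of calling process_fulltext against that client. The PDF path is hypothetical; only the error_msg key is taken from the code shown in the next hunk:

    with open("paper.pdf", "rb") as f:  # hypothetical input file
        blob = f.read()

    result = client.process_fulltext(blob, consolidate_mode=2)
    if 'error_msg' in result:
        print("GROBID error:", result['error_msg'])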
@@ -42,3 +47,58 @@ class GrobidClient(object):
info['error_msg'] = grobid_response.text[:10000]
        return info

+class GrobidWorker(SandcrawlerWorker):
+
+ def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
+ super().__init__()
+ self.grobid_client = grobid_client
+ self.wayback_client = wayback_client
+ self.sink = sink
+ self.consolidate_mode = 1
+
+ def process(self, record):
+ if record.get('warc_path') and record.get('warc_offset'):
+ # it's a full CDX dict. fetch using WaybackClient
+ if not self.wayback_client:
+ raise Exception("wayback client not configured for this GrobidWorker")
+ blob = self.wayback_client.fetch_warc_content(record['warc_path'],
+ record['warc_offset'], record['warc_csize'])
+ elif record.get('url') and record.get('datetime'):
+            # it's a partial CDX dict (just url and datetime); fetch using WaybackClient
+ if not self.wayback_client:
+ raise Exception("wayback client not configured for this GrobidWorker")
+ blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime'])
+ elif record.get('item') and record.get('path'):
+            # it's a petabox link; fetch via HTTP
+ resp = requests.get("https://archive.org/serve/{}/{}".format(
+ record['item'], record['path']))
+ resp.raise_for_status()
+            blob = resp.content
+ else:
+ raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
+ assert blob
+ result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
+ result['file_meta'] = gen_file_metadata(blob)
+ result['source'] = record
+ result['key'] = result['file_meta']['sha1hex']
+ return result
+
+class GrobidBlobWorker(SandcrawlerWorker):
+ """
+    This is sort of like GrobidWorker, except it receives blobs directly
+    instead of fetching them from some remote store.
+ """
+
+ def __init__(self, grobid_client, sink=None, **kwargs):
+ super().__init__()
+ self.grobid_client = grobid_client
+ self.sink = sink
+ self.consolidate_mode = 1
+
+ def process(self, blob):
+ assert blob
+ result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
+ result['file_meta'] = gen_file_metadata(blob)
+ result['key'] = result['file_meta']['sha1hex']
+ return result
+
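Finally, a usage sketch for the two new workers. The record field names come straight from process() above; the no-argument WaybackClient constructor and the WARC values are assumptions for illustration:

    from sandcrawler.grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
    from sandcrawler.ia import WaybackClient

    grobid_client = GrobidClient()
    worker = GrobidWorker(grobid_client, wayback_client=WaybackClient())

    # full CDX-style dict: body is fetched from a WARC via WaybackClient
    record = {
        'warc_path': 'example/crawl-00000.warc.gz',  # hypothetical values
        'warc_offset': 12345,
        'warc_csize': 67890,
    }
    result = worker.process(record)
    print(result['key'])  # sha1hex of the fetched file

    # blob variant: caller already holds the bytes
    blob_worker = GrobidBlobWorker(grobid_client)
    with open("paper.pdf", "rb") as f:  # hypothetical input file
        result = blob_worker.process(f.read())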