diff options
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- | python/sandcrawler/grobid.py | 130 |
1 files changed, 130 insertions, 0 deletions
import requests

from grobid2json import teixml2json
from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata


class GrobidClient(object):
    """
    Thin HTTP client for a GROBID fulltext-extraction service.

    Wraps the /api/processFulltextDocument endpoint and converts the TEI-XML
    response into the dict shape used by the rest of sandcrawler.
    """

    def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
        self.host_url = host_url
        # Default citation/header consolidation mode for GROBID requests;
        # may be overridden per-call in process_fulltext().
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))

    def process_fulltext(self, blob, consolidate_mode=None):
        """
        Run GROBID fulltext extraction over a raw PDF blob.

        Returns dict with keys:
        - status_code
        - status (slug)
        - error_msg (if status == 'error')
        - tei_xml (if status is 200)

        TODO: persist connection for performance?
        """
        assert blob

        if consolidate_mode is None:
            consolidate_mode = self.consolidate_mode

        try:
            grobid_response = requests.post(
                self.host_url + "/api/processFulltextDocument",
                files={
                    'input': blob,
                    # BUGFIX: previously sent self.consolidate_mode, which
                    # silently ignored the per-call consolidate_mode argument
                    'consolidateHeader': consolidate_mode,
                    'consolidateCitations': 0, # too expensive for now
                    'includeRawCitations': 1,
                },
                timeout=180.0,
            )
        except requests.Timeout:
            return {
                'status': 'error-timeout',
                'status_code': -4, # heritrix3 "HTTP timeout" code
                'error_msg': 'GROBID request (HTTP POST) timeout',
            }

        info = dict(
            status_code=grobid_response.status_code,
        )
        if grobid_response.status_code == 200:
            info['status'] = 'success'
            info['tei_xml'] = grobid_response.text
            if len(info['tei_xml']) > 12000000:
                # XML is larger than Kafka message size, and much larger than
                # an article in general; bail out
                info['status'] = 'error'
                info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
                info.pop('tei_xml')
        else:
            # response.text is .content decoded as utf-8
            info['status'] = 'error'
            info['error_msg'] = grobid_response.text[:10000]
        return info

    def metadata(self, result):
        """
        Extract a compact metadata dict from a successful process_fulltext()
        result. Returns None if the result was not a success.
        """
        if result['status'] != 'success':
            return None
        tei_json = teixml2json(result['tei_xml'], encumbered=False)
        meta = dict()
        biblio = dict()
        # bibliographic fields nested under 'biblio'
        for k in ('title', 'authors', 'journal', 'date', 'doi', ):
            if tei_json.get(k):
                biblio[k] = tei_json[k]
        meta['biblio'] = biblio
        # processing/provenance fields kept at the top level
        for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
            if tei_json.get(k):
                meta[k] = tei_json[k]
        return meta


class GrobidWorker(SandcrawlerFetchWorker):
    """
    Worker that fetches a blob (e.g. from wayback) and runs it through
    GROBID, emitting results to an optional sink.
    """

    def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
        super().__init__(wayback_client=wayback_client)
        self.grobid_client = grobid_client
        self.sink = sink
        self.consolidate_mode = 0

    def timeout_response(self, task):
        # Shape must match a normal process() result so downstream consumers
        # can handle worker-level timeouts uniformly.
        default_key = task['sha1hex']
        return dict(
            status="error-timeout",
            error_msg="internal GROBID worker timeout",
            source=task,
            key=default_key,
        )

    def process(self, record, key=None):
        default_key = record['sha1hex']

        fetch_result = self.fetch_blob(record)
        if fetch_result['status'] != 'success':
            # propagate fetch failures as-is
            return fetch_result
        blob = fetch_result['blob']

        result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
        result['file_meta'] = gen_file_metadata(blob)
        result['source'] = record
        # key on the hash of the actual fetched bytes, not the request record
        result['key'] = result['file_meta']['sha1hex']
        return result


class GrobidBlobWorker(SandcrawlerWorker):
    """
    This is sort of like GrobidWorker, except it receives blobs directly,
    instead of fetching blobs from some remote store.
    """

    def __init__(self, grobid_client, sink=None, **kwargs):
        super().__init__()
        self.grobid_client = grobid_client
        self.sink = sink
        self.consolidate_mode = 0

    def process(self, blob, key=None):
        # empty/None blobs are skipped silently (best-effort semantics)
        if not blob:
            return None
        result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
        result['file_meta'] = gen_file_metadata(blob)
        result['key'] = result['file_meta']['sha1hex']
        return result