From 37bf997dc0220a30605249655056e90f04e33366 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 26 Sep 2019 12:00:01 -0700 Subject: lots of grobid tool implementation (still WIP) --- python/sandcrawler/grobid.py | 66 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) (limited to 'python/sandcrawler/grobid.py') diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 0e37c0e..a610404 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -1,10 +1,15 @@ import requests +from collections import Counter + +from .workers import SandcrawlerWorker +from .misc import gen_file_metadata +from .ia import WaybackClient, WaybackError class GrobidClient(object): - def __init__(self, host_uri, **kwargs): - self.host_uri = host_uri + def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs): + self.host_url = host_url self.consolidate_mode = int(kwargs.get('consolidate_mode', 1)) def process_fulltext(self, blob, consolidate_mode=None): @@ -23,7 +28,7 @@ class GrobidClient(object): consolidate_mode = self.consolidate_mode grobid_response = requests.post( - self.host_uri + "/api/processFulltextDocument", + self.host_url + "/api/processFulltextDocument", files={ 'input': blob, 'consolidate_mode': self.consolidate_mode, @@ -42,3 +47,58 @@ class GrobidClient(object): info['error_msg'] = grobid_response.text[:10000] return info +class GrobidWorker(SandcrawlerWorker): + + def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs): + super().__init__() + self.grobid_client = grobid_client + self.wayback_client = wayback_client + self.sink = sink + self.consolidate_mode = 1 + + def process(self, record): + if record.get('warc_path') and record.get('warc_offset'): + # it's a full CDX dict. 
fetch using WaybackClient + if not self.wayback_client: + raise Exception("wayback client not configured for this GrobidWorker") + blob = self.wayback_client.fetch_warc_content(record['warc_path'], + record['warc_offset'], record['warc_csize']) + elif record.get('url') and record.get('datetime'): + # it's a partial CDX dict or something? fetch using WaybackClient + if not self.wayback_client: + raise Exception("wayback client not configured for this GrobidWorker") + blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime']) + elif record.get('item') and record.get('path'): + # it's petabox link; fetch via HTTP + resp = requests.get("https://archive.org/serve/{}/{}".format( + record['item'], record['path'])) + resp.raise_for_status() + blob = resp.content + else: + raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed") + assert blob + result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode) + result['file_meta'] = gen_file_metadata(blob) + result['source'] = record + result['key'] = result['file_meta']['sha1hex'] + return result + +class GrobidBlobWorker(SandcrawlerWorker): + """ + This is sort of like GrobidWorker, except it receives blobs directly, + instead of fetching blobs from some remote store. + """ + + def __init__(self, grobid_client, sink=None, **kwargs): + super().__init__() + self.grobid_client = grobid_client + self.sink = sink + self.consolidate_mode = 1 + + def process(self, blob): + assert blob + result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode) + result['file_meta'] = gen_file_metadata(blob) + result['key'] = result['file_meta']['sha1hex'] + return result + -- cgit v1.2.3