From 37bf997dc0220a30605249655056e90f04e33366 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 26 Sep 2019 12:00:01 -0700 Subject: lots of grobid tool implementation (still WIP) --- python/sandcrawler/grobid.py | 66 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) (limited to 'python/sandcrawler/grobid.py') diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 0e37c0e..a610404 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -1,10 +1,15 @@ import requests +from collections import Counter + +from .workers import SandcrawlerWorker +from .misc import gen_file_metadata +from .ia import WaybackClient, WaybackError class GrobidClient(object): - def __init__(self, host_uri, **kwargs): - self.host_uri = host_uri + def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs): + self.host_url = host_url self.consolidate_mode = int(kwargs.get('consolidate_mode', 1)) def process_fulltext(self, blob, consolidate_mode=None): @@ -23,7 +28,7 @@ class GrobidClient(object): consolidate_mode = self.consolidate_mode grobid_response = requests.post( - self.host_uri + "/api/processFulltextDocument", + self.host_url + "/api/processFulltextDocument", files={ 'input': blob, 'consolidate_mode': self.consolidate_mode, @@ -42,3 +47,58 @@ class GrobidClient(object): info['error_msg'] = grobid_response.text[:10000] return info +class GrobidWorker(SandcrawlerWorker): + + def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs): + super().__init__() + self.grobid_client = grobid_client + self.wayback_client = wayback_client + self.sink = sink + self.consolidate_mode = 1 + + def process(self, record): + if record.get('warc_path') and record.get('warc_offset'): + # it's a full CDX dict. 
fetch using WaybackClient + if not self.wayback_client: + raise Exception("wayback client not configured for this GrobidWorker") + blob = self.wayback_client.fetch_warc_content(record['warc_path'], + record['warc_offset'], record['warc_csize']) + elif record.get('url') and record.get('datetime'): + # it's a partial CDX dict or something? fetch using WaybackClient + if not self.wayback_client: + raise Exception("wayback client not configured for this GrobidWorker") + blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime']) + elif record.get('item') and record.get('path'): + # it's petabox link; fetch via HTTP + resp = requests.get("https://archive.org/serve/{}/{}".format( + record['item'], record['path'])) + resp.raise_for_status() + blob = resp.content + else: + raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed") + assert blob + result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode) + result['file_meta'] = gen_file_metadata(blob) + result['source'] = record + result['key'] = result['file_meta']['sha1hex'] + return result + +class GrobidBlobWorker(SandcrawlerWorker): + """ + This is sort of like GrobidWorker, except it receives blobs directly, + instead of fetching blobs from some remote store. + """ + + def __init__(self, grobid_client, sink=None, **kwargs): + super().__init__() + self.grobid_client = grobid_client + self.sink = sink + self.consolidate_mode = 1 + + def process(self, blob): + assert blob + result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode) + result['file_meta'] = gen_file_metadata(blob) + result['key'] = result['file_meta']['sha1hex'] + return result + -- cgit v1.2.3