diff options
Diffstat (limited to 'python/sandcrawler/pdftrio.py')
-rw-r--r-- | python/sandcrawler/pdftrio.py | 158 |
1 files changed, 158 insertions, 0 deletions
# Reconstructed from the diff that adds python/sandcrawler/pdftrio.py
# (new file, index a2eedd8).

import requests

from .workers import SandcrawlerWorker
from .misc import gen_file_metadata, requests_retry_session
from .ia import WaybackClient, WaybackError, PetaboxError


class PdfTrioClient(object):
    """
    Thin HTTP client that submits PDF blobs to a pdftrio service for
    classification.
    """

    def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
        # Base URL of the pdftrio service; the API path is appended per request.
        self.host_url = host_url
        # NOTE(review): this retrying session is created here, but
        # classify_pdf() posts through the module-level requests.post() and
        # never uses it. Confirm whether that is intentional before switching.
        self.http_session = requests_retry_session(retries=3, backoff_factor=3)

    def classify_pdf(self, blob):
        """
        POSTs `blob` to the "/classify/pdf/all" endpoint and returns a dict
        with at least:

        - status_code (int, always set)
        - status (success, or error-*)

        On success, the other remote API JSON response keys are also included,
        and `timing.total_sec` holds the elapsed request time in seconds (a
        float).

        On HTTP-level failures, the status_code and status field are set
        appropriately; an optional `error_msg` may also be set. For some other
        errors, like connection failure, an exception is raised.

        An AssertionError is raised if `blob` is empty, or if a 200 response
        is missing any of the `ensemble_score`, `status`, or `versions` keys.
        """
        assert blob

        try:
            pdftrio_response = requests.post(
                self.host_url + "/classify/pdf/all",
                files={
                    'pdf': blob,
                },
                timeout=30.0,
            )
        except requests.Timeout:
            # No response object exists here, so no timing info is attached.
            return {
                'status': 'error-timeout',
                'status_code': -4,  # heritrix3 "HTTP timeout" code
                'error_msg': 'pdftrio request (HTTP POST) timeout',
            }

        info = dict(
            status_code=pdftrio_response.status_code,
        )
        if pdftrio_response.status_code == 200:
            resp_json = pdftrio_response.json()
            assert 'ensemble_score' in resp_json
            assert 'status' in resp_json
            assert 'versions' in resp_json
            info.update(resp_json)
        else:
            info['status'] = 'error'
            # TODO: might return JSON with some info?

        # add this timing info at end so it isn't clobbered by an update()
        if not info.get('timing'):
            info['timing'] = dict()
        # Store a plain float. A trailing comma on this assignment would turn
        # the value into a one-element tuple.
        info['timing']['total_sec'] = pdftrio_response.elapsed.total_seconds()
        return info


class PdfTrioWorker(SandcrawlerWorker):
    """
    Fetches a PDF blob described by a record and classifies it with pdftrio.

    This class is basically copied directly from GrobidWorker
    """

    def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
        super().__init__()
        self.pdftrio_client = pdftrio_client
        self.wayback_client = wayback_client
        self.sink = sink

    def process(self, record):
        """
        Fetches the blob referenced by `record` and returns the pdftrio
        classification result for it.

        `record` must have a `sha1hex` key, plus one of these key groups,
        checked in order:

        - `warc_path`, `warc_offset` (and `warc_csize`): fetched from petabox
          via the wayback client
        - `url`, `datetime`: fetched as a replay body via the wayback client
        - `item`, `path`: fetched over HTTP from archive.org

        Returns a dict. On fetch failure or an empty blob it holds `status`,
        `error_msg`, `source` (the record) and `key` (the record's sha1hex).
        Otherwise it is the classify_pdf() result extended with `file_meta`,
        `source` and `key` (the sha1hex computed from the fetched blob).

        Raises Exception if a wayback fetch is needed but no wayback client
        was configured, and ValueError if the record matches none of the
        key groups above.
        """
        default_key = record['sha1hex']
        if record.get('warc_path') and record.get('warc_offset'):
            # it's a full CDX dict. fetch using WaybackClient
            if not self.wayback_client:
                raise Exception("wayback client not configured for this PdfTrioWorker")
            try:
                blob = self.wayback_client.fetch_petabox_body(
                    csize=record['warc_csize'],
                    offset=record['warc_offset'],
                    warc_path=record['warc_path'],
                )
            except (WaybackError, PetaboxError) as we:
                return dict(
                    status="error-wayback",
                    error_msg=str(we),
                    source=record,
                    key=default_key,
                )
        elif record.get('url') and record.get('datetime'):
            # it's a partial CDX dict or something? fetch using WaybackClient
            if not self.wayback_client:
                raise Exception("wayback client not configured for this PdfTrioWorker")
            try:
                blob = self.wayback_client.fetch_replay_body(
                    url=record['url'],
                    datetime=record['datetime'],
                )
            except WaybackError as we:
                return dict(
                    status="error-wayback",
                    error_msg=str(we),
                    source=record,
                    key=default_key,
                )
        elif record.get('item') and record.get('path'):
            # it's petabox link; fetch via HTTP
            # The GET itself sits outside the try below, so connection-level
            # failures propagate as exceptions; only HTTP error statuses are
            # turned into an "error-petabox" result.
            resp = requests.get("https://archive.org/serve/{}/{}".format(
                record['item'], record['path']))
            try:
                resp.raise_for_status()
            except Exception as e:
                return dict(
                    status="error-petabox",
                    error_msg=str(e),
                    source=record,
                    key=default_key,
                )
            blob = resp.content
        else:
            raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
        if not blob:
            # classify_pdf() asserts on a non-empty blob, so report this case
            # as an error result instead of calling it.
            return dict(
                status="error",
                error_msg="empty blob",
                source=record,
                key=default_key,
            )
        result = self.pdftrio_client.classify_pdf(blob)
        result['file_meta'] = gen_file_metadata(blob)
        result['source'] = record
        # Key the result on the hash of what was actually fetched, not on the
        # sha1hex claimed by the input record.
        result['key'] = result['file_meta']['sha1hex']
        return result


class PdfTrioBlobWorker(SandcrawlerWorker):
    """
    This is sort of like PdfTrioWorker, except it receives blobs directly,
    instead of fetching blobs from some remote store.
    """

    def __init__(self, pdftrio_client, sink=None, **kwargs):
        super().__init__()
        self.pdftrio_client = pdftrio_client
        self.sink = sink

    def process(self, blob):
        """
        Classifies `blob` with pdftrio.

        Returns None for an empty blob; otherwise the classify_pdf() result
        extended with `file_meta` and `key` (the blob's sha1hex).
        """
        if not blob:
            return None
        result = self.pdftrio_client.classify_pdf(blob)
        result['file_meta'] = gen_file_metadata(blob)
        result['key'] = result['file_meta']['sha1hex']
        return result