aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/pdftrio.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/pdftrio.py')
-rw-r--r--python/sandcrawler/pdftrio.py158
1 files changed, 158 insertions, 0 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
new file mode 100644
index 0000000..a2eedd8
--- /dev/null
+++ b/python/sandcrawler/pdftrio.py
@@ -0,0 +1,158 @@
+
+import requests
+
+from .workers import SandcrawlerWorker
+from .misc import gen_file_metadata, requests_retry_session
+from .ia import WaybackClient, WaybackError, PetaboxError
+
+
class PdfTrioClient(object):
    """
    Thin HTTP client for the pdf_trio classification service.
    """

    def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
        self.host_url = host_url
        # retry-enabled session; all requests to the service should go
        # through this (previously it was created but never used)
        self.http_session = requests_retry_session(retries=3, backoff_factor=3)

    def classify_pdf(self, blob):
        """
        Returns a dict with at least:

        - status_code (int, always set)
        - status (success, or error-*)

        On success, the other remote API JSON response keys are also included.

        On HTTP-level failures, the status_code and status field are set
        appropriately; an optional `error_msg` may also be set. For some other
        errors, like connection failure, an exception is raised.
        """
        assert blob

        try:
            # fixed: use the retry-enabled session from __init__ instead of a
            # bare requests.post (which ignored the retry/backoff config)
            pdftrio_response = self.http_session.post(
                self.host_url + "/classify/pdf/all",
                files={
                    'pdf': blob,
                },
                timeout=30.0,
            )
        except requests.Timeout:
            return {
                'status': 'error-timeout',
                'status_code': -4,  # heritrix3 "HTTP timeout" code
                'error_msg': 'pdftrio request (HTTP POST) timeout',
            }

        info = dict(
            status_code=pdftrio_response.status_code,
        )
        if pdftrio_response.status_code == 200:
            resp_json = pdftrio_response.json()
            # sanity-check the remote API response shape before merging it in
            assert 'ensemble_score' in resp_json
            assert 'status' in resp_json
            assert 'versions' in resp_json
            info.update(resp_json)
        else:
            info['status'] = 'error'
            # TODO: might return JSON with some info?

        # add this timing info at end so it isn't clobbered by an update()
        if not info.get('timing'):
            info['timing'] = dict()
        # fixed: a stray trailing comma previously made this a 1-tuple
        info['timing']['total_sec'] = pdftrio_response.elapsed.total_seconds()
        return info
+
+
class PdfTrioWorker(SandcrawlerWorker):
    """
    Fetches PDF blobs (from wayback, replay, or petabox) and runs pdf_trio
    classification on them.

    This class is basically copied directly from GrobidWorker
    """

    def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
        super().__init__()
        self.pdftrio_client = pdftrio_client
        self.wayback_client = wayback_client
        self.sink = sink

    def process(self, record):
        """
        Fetch the PDF bytes described by `record`, classify them, and return
        a result dict (keyed by sha1hex). Fetch failures are returned as
        error dicts, not raised, so batch processing can continue.
        """
        default_key = record['sha1hex']
        # NOTE: warc_offset may legitimately be 0, so test for presence
        # rather than truthiness
        if record.get('warc_path') and record.get('warc_offset') is not None:
            # it's a full CDX dict. fetch using WaybackClient
            if not self.wayback_client:
                raise Exception("wayback client not configured for this PdfTrioWorker")
            try:
                blob = self.wayback_client.fetch_petabox_body(
                    csize=record['warc_csize'],
                    offset=record['warc_offset'],
                    warc_path=record['warc_path'],
                )
            except (WaybackError, PetaboxError) as we:
                return dict(
                    status="error-wayback",
                    error_msg=str(we),
                    source=record,
                    key=default_key,
                )
        elif record.get('url') and record.get('datetime'):
            # it's a partial CDX dict or something? fetch using WaybackClient
            if not self.wayback_client:
                raise Exception("wayback client not configured for this PdfTrioWorker")
            try:
                blob = self.wayback_client.fetch_replay_body(
                    url=record['url'],
                    datetime=record['datetime'],
                )
            except WaybackError as we:
                return dict(
                    status="error-wayback",
                    error_msg=str(we),
                    source=record,
                    key=default_key,
                )
        elif record.get('item') and record.get('path'):
            # it's petabox link; fetch via HTTP
            # NOTE: the GET itself is now inside the try (with a timeout) so
            # connection errors become error-petabox results instead of
            # crashing the worker
            try:
                resp = requests.get(
                    "https://archive.org/serve/{}/{}".format(
                        record['item'], record['path']),
                    timeout=60.0,
                )
                resp.raise_for_status()
            except Exception as e:
                return dict(
                    status="error-petabox",
                    error_msg=str(e),
                    source=record,
                    key=default_key,
                )
            blob = resp.content
        else:
            raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
        if not blob:
            return dict(
                status="error",
                error_msg="empty blob",
                source=record,
                key=default_key,
            )
        result = self.pdftrio_client.classify_pdf(blob)
        result['file_meta'] = gen_file_metadata(blob)
        result['source'] = record
        result['key'] = result['file_meta']['sha1hex']
        return result
+
class PdfTrioBlobWorker(SandcrawlerWorker):
    """
    Like PdfTrioWorker, except it operates on PDF bytes handed to it
    directly rather than fetching them from a remote store first.
    """

    def __init__(self, pdftrio_client, sink=None, **kwargs):
        super().__init__()
        self.pdftrio_client = pdftrio_client
        self.sink = sink

    def process(self, blob):
        # nothing to classify; signal "no result" to the caller
        if not blob:
            return None
        result = self.pdftrio_client.classify_pdf(blob)
        meta = gen_file_metadata(blob)
        result['file_meta'] = meta
        result['key'] = meta['sha1hex']
        return result
+