From 4cf5345040b4e8a5d77ca3ceb0f7ea4f8c5778dc Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 18 Feb 2020 19:01:58 -0800 Subject: pdftrio: mode controlled by CLI arg --- python/sandcrawler/pdftrio.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'python/sandcrawler/pdftrio.py') diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 52d1b8d..7a2e53c 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -13,7 +13,7 @@ class PdfTrioClient(object): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) - def classify_pdf(self, blob): + def classify_pdf(self, blob, mode="auto"): """ Returns a dict with at least: @@ -30,7 +30,7 @@ class PdfTrioClient(object): try: pdftrio_response = requests.post( - self.host_url + "/classify/research-pub/all", + self.host_url + "/classify/research-pub/" + mode, files={ 'pdf_content': blob, }, @@ -167,10 +167,11 @@ class PdfTrioBlobWorker(SandcrawlerWorker): instead of fetching blobs from some remote store. """ - def __init__(self, pdftrio_client, sink=None, **kwargs): + def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs): super().__init__() self.pdftrio_client = pdftrio_client self.sink = sink + self.mode = mode def process(self, blob): start_process = time.time() @@ -179,7 +180,7 @@ class PdfTrioBlobWorker(SandcrawlerWorker): result = dict() result['file_meta'] = gen_file_metadata(blob) result['key'] = result['file_meta']['sha1hex'] - result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob) + result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=mode) result['timing'] = dict( pdftrio_sec=result['pdf_trio'].pop('_total_sec', None), total_sec=time.time() - start_process, -- cgit v1.2.3