aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-13 17:11:08 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-13 17:11:08 -0800
commit 2db64d580e3b5995e328c719e2bd014124179d0b (patch)
tree 2603afb605e4e1949d3f850861b80b3f786b2194
parent 3370f203c3652ace357eeb69bb8828d830b3596a (diff)
download sandcrawler-2db64d580e3b5995e328c719e2bd014124179d0b.tar.gz
sandcrawler-2db64d580e3b5995e328c719e2bd014124179d0b.zip
pdftrio fixes from testing
-rw-r--r-- python/sandcrawler/pdftrio.py 12
1 files changed, 9 insertions, 3 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 5e4630b..168fb78 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -139,12 +139,12 @@ class PdfTrioWorker(SandcrawlerWorker):
key=default_key,
)
result = dict()
- result['key'] = result['file_meta']['sha1hex']
result['file_meta'] = gen_file_metadata(blob)
+ result['key'] = result['file_meta']['sha1hex']
result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
result['source'] = record
result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec'),
+ pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
total_sec=time.time() - start_process,
)
if wayback_sec:
@@ -165,10 +165,16 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
self.sink = sink
def process(self, blob):
+ start_process = time.time()
if not blob:
return None
- result = self.pdftrio_client.classify_pdf(blob)
+ result = dict()
result['file_meta'] = gen_file_metadata(blob)
result['key'] = result['file_meta']['sha1hex']
+ result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
+ result['timing'] = dict(
+ pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ total_sec=time.time() - start_process,
+ )
return result