From 2db64d580e3b5995e328c719e2bd014124179d0b Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Thu, 13 Feb 2020 17:11:08 -0800
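Subject: pdftrio fixes from testing

PdfTrioWorker: set result['key'] only after result['file_meta'] exists
(the old order raised KeyError on the freshly created dict), and
tolerate a classify response that has no '_total_sec' entry.

PdfTrioBlobWorker: nest the classify response under 'pdf_trio' instead
of using it as the top-level result, and record 'timing'.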

---
 python/sandcrawler/pdftrio.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 5e4630b..168fb78 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -139,12 +139,12 @@ class PdfTrioWorker(SandcrawlerWorker):
                 key=default_key,
             )
         result = dict()
-        result['key'] = result['file_meta']['sha1hex']
         result['file_meta'] = gen_file_metadata(blob)
+        result['key'] = result['file_meta']['sha1hex']
         result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
         result['source'] = record
         result['timing'] = dict(
-            pdftrio_sec=result['pdf_trio'].pop('_total_sec'),
+            pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
             total_sec=time.time() - start_process,
         )
         if wayback_sec:
@@ -165,10 +165,16 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
         self.sink = sink
 
     def process(self, blob):
+        start_process = time.time()
         if not blob:
             return None
-        result = self.pdftrio_client.classify_pdf(blob)
+        result = dict()
         result['file_meta'] = gen_file_metadata(blob)
         result['key'] = result['file_meta']['sha1hex']
+        result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
+        result['timing'] = dict(
+            pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+            total_sec=time.time() - start_process,
+        )
         return result
 
-- 
cgit v1.2.3