diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-13 17:11:08 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-13 17:11:08 -0800 |
commit | 2db64d580e3b5995e328c719e2bd014124179d0b (patch) | |
tree | 2603afb605e4e1949d3f850861b80b3f786b2194 | |
parent | 3370f203c3652ace357eeb69bb8828d830b3596a (diff) | |
download | sandcrawler-2db64d580e3b5995e328c719e2bd014124179d0b.tar.gz sandcrawler-2db64d580e3b5995e328c719e2bd014124179d0b.zip |
pdftrio fixes from testing
-rw-r--r-- | python/sandcrawler/pdftrio.py | 12 |
1 files changed, 9 insertions, 3 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 5e4630b..168fb78 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -139,12 +139,12 @@ class PdfTrioWorker(SandcrawlerWorker):
                 key=default_key,
             )
         result = dict()
-        result['key'] = result['file_meta']['sha1hex']
         result['file_meta'] = gen_file_metadata(blob)
+        result['key'] = result['file_meta']['sha1hex']
         result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
         result['source'] = record
         result['timing'] = dict(
-            pdftrio_sec=result['pdf_trio'].pop('_total_sec'),
+            pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
             total_sec=time.time() - start_process,
         )
         if wayback_sec:
@@ -165,10 +165,16 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
         self.sink = sink
 
     def process(self, blob):
+        start_process = time.time()
         if not blob:
             return None
-        result = self.pdftrio_client.classify_pdf(blob)
+        result = dict()
         result['file_meta'] = gen_file_metadata(blob)
         result['key'] = result['file_meta']['sha1hex']
+        result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
+        result['timing'] = dict(
+            pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+            total_sec=time.time() - start_process,
+        )
         return result
 