about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 14:57:03 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 14:57:03 -0700
commit: d14715279f03fae6cf0fe1998d7845e3a245e86e (patch)
tree: 1baeedf60e3ed5fcf2eaca50334fa10f7ca9a53b
parent: f651f171499d011a83a247eefb23c4e8a789b1c8 (diff)
download: sandcrawler-d14715279f03fae6cf0fe1998d7845e3a245e86e.tar.gz
download: sandcrawler-d14715279f03fae6cf0fe1998d7845e3a245e86e.zip
ensure pdf_meta isn't passed an empty dict()
-rw-r--r-- python/sandcrawler/pdfextract.py | 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 5ef5dfd..301754a 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -87,6 +87,9 @@ class PdfExtractResult:
metadata[k.lower()] = self.pdf_info[k]
if 'CreationDate' in self.pdf_info:
pdf_created = self.pdf_info['CreationDate']
+ metadata_json: Optional[str] = None
+ if metadata:
+ metadata_json = json.dumps(metadata, sort_keys=True)
return (
self.sha1hex,
datetime.datetime.now(), # updated
@@ -99,7 +102,7 @@ class PdfExtractResult:
pdf_extra.get('permanent_id'),
pdf_created,
pdf_extra.get('pdf_version'),
- metadata and json.dumps(metadata, sort_keys=True),
+ metadata_json,
)