diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 18:18:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 18:18:19 -0700 |
commit | 6f382a4c07ccac68896f75d55835a8876981edbd (patch) | |
tree | a8c998a5183cb40e7f3a89c793db09731d8ac5db /python/sandcrawler/pdfextract.py | |
parent | 585e0ba76687ae2872faed88bd1edc5b051136ca (diff) | |
download | sandcrawler-6f382a4c07ccac68896f75d55835a8876981edbd.tar.gz sandcrawler-6f382a4c07ccac68896f75d55835a8876981edbd.zip |
pdfextract support in ingest worker
Diffstat (limited to 'python/sandcrawler/pdfextract.py')
-rw-r--r-- | python/sandcrawler/pdfextract.py | 24 |
1 file changed, 24 insertions, 0 deletions
@classmethod
def from_pdf_meta_dict(cls, record):
    """
    Build a result object from a ``pdf_meta``-style row dict.

    Parses what would be returned from postgrest.

    Args:
        record: mapping that must contain 'sha1hex' and 'status'. The keys
            'metadata', 'has_page0_thumbnail', 'page_count', 'page0_height',
            'page0_width', 'permanent_id' and 'pdf_version' are read if
            present.

    Returns:
        An instance of ``cls``. For a non-'success' status only sha1hex,
        status and error_msg are passed to the constructor; otherwise
        has_page0_thumbnail, pdf_info and pdf_extra are passed as well.

    Raises:
        KeyError: if 'sha1hex' or 'status' is missing from ``record``.
    """
    if record['status'] != 'success':
        # The 'metadata' key can be present with a null value; a plain
        # .get('metadata', {}) would then return None and the chained
        # .get('error_msg') would raise AttributeError.
        metadata = record.get('metadata') or {}
        return cls(
            sha1hex=record['sha1hex'],
            status=record['status'],
            error_msg=metadata.get('error_msg'),
        )

    # Only truthy column values are copied, so null / zero / empty-string
    # columns are left out of pdf_extra entirely.
    extra_keys = (
        'page_count',
        'page0_height',
        'page0_width',
        'permanent_id',
        'pdf_version',
    )
    pdf_extra = {k: record[k] for k in extra_keys if record.get(k)}
    return cls(
        sha1hex=record['sha1hex'],
        status=record['status'],
        has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
        pdf_info=record.get('metadata'),
        pdf_extra=pdf_extra,
    )