diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 23:37:50 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-04 09:04:47 -0800 |
commit | a87ca1de1d8b31c4fbf9fddead27cdc58b09565a (patch) | |
tree | 575868d7d9c0d3d28a37d288d5e4975e57c8eaab /python/sandcrawler/persist.py | |
parent | 8f964b9b48572ac71f27ba64207816dfd3a6dc36 (diff) | |
download | sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.tar.gz sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.zip |
initial implementation of HTML ingest in existing worker
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 20 |
1 files changed, 17 insertions, 3 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index c225d5a..033bc91 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -28,6 +28,7 @@ from sandcrawler.db import SandcrawlerPostgresClient from sandcrawler.minio import SandcrawlerMinioClient from sandcrawler.grobid import GrobidClient from sandcrawler.pdfextract import PdfExtractResult +from sandcrawler.html_ingest import HtmlMetaRow class PersistCdxWorker(SandcrawlerWorker): @@ -159,8 +160,21 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): result['terminal_sha1hex'] = terminal.get('terminal_sha1hex') return result - def result_to_html_meta(self, record: dict) -> Optional[dict]: - raise NotImplementedError() + def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]: + html_body = record.get('html_body') + file_meta = record.get('file_meta') + if not (file_meta and html_body): + return None + return HtmlMetaRow( + sha1hex=file_meta["sha1hex"], + status=record.get('status'), + scope=record.get('scope'), + has_teixml=bool(html_body and html_body['status'] == 'success' and html_body.get('tei_xml')), + has_thumbnail=False, # TODO + word_count=(html_body and html_body.get('word_count')) or None, + biblio=record.get('html_biblio'), + resources=record.get('html_resources'), + ) def push_batch(self, batch): self.counts['total'] += len(batch) @@ -200,7 +214,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): self.counts['insert-file_meta'] += resp[0] self.counts['update-file_meta'] += resp[1] - html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_meta')] + html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body')] if html_meta_batch: resp = self.db.insert_html_meta(self.cur, html_meta_batch, on_conflict="nothing") self.counts['insert-html_meta'] += resp[0] |