diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 23:37:50 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-04 09:04:47 -0800 |
commit | a87ca1de1d8b31c4fbf9fddead27cdc58b09565a (patch) | |
tree | 575868d7d9c0d3d28a37d288d5e4975e57c8eaab /python/sandcrawler/html_ingest.py | |
parent | 8f964b9b48572ac71f27ba64207816dfd3a6dc36 (diff) | |
download | sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.tar.gz sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.zip |
initial implementation of HTML ingest in existing worker
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r-- | python/sandcrawler/html_ingest.py | 29 |
1 files changed, 22 insertions, 7 deletions
```diff
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index fe883ba..11909e6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -76,21 +76,36 @@ class IngestWebResult(pydantic.BaseModel):
             datetime.datetime: lambda dt: dt.isoformat(),
         }
 
+class HtmlMetaRow(pydantic.BaseModel):
+    sha1hex: str
+    status: str
+    scope: Optional[str]
+    has_teixml: bool
+    has_thumbnail: bool
+    word_count: Optional[int]
+    biblio: Optional[dict]
+    resources: Optional[List[dict]]
+
+    class Config:
+        arbitrary_types_allowed = True
+        json_encoders = {
+            datetime.datetime: lambda dt: dt.isoformat(),
+        }
+
     def to_sql_tuple(self) -> Tuple:
         """
         This is for the html_meta SQL table.
         """
-        assert self.file_meta and "sha1hex" in self.file_meta
         return (
-            self.file_meta["sha1hex"],
+            self.sha1hex,
             datetime.datetime.now(), # updated
             self.status,
             self.scope,
-            bool(self.html_body and self.html_body['status'] == 'success' and self.html_body['tei_xml']),
-            False, # has_thumbnail
-            (self.html_body and self.html_body.get('word_count')) or None,
-            self.html_biblio,
-            self.html_resources,
+            self.has_teixml,
+            self.has_thumbnail,
+            self.word_count,
+            self.biblio and json.dumps(self.biblio, sort_keys=True),
+            self.resources and json.dumps(self.resources, sort_keys=True),
         )
 
 
```