about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/html_ingest.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2020-11-03 23:37:50 -0800
committer: Bryan Newbold <bnewbold@archive.org>, 2020-11-04 09:04:47 -0800
commit: a87ca1de1d8b31c4fbf9fddead27cdc58b09565a (patch)
tree: 575868d7d9c0d3d28a37d288d5e4975e57c8eaab /python/sandcrawler/html_ingest.py
parent: 8f964b9b48572ac71f27ba64207816dfd3a6dc36 (diff)
download: sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.tar.gz
download: sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.zip
initial implementation of HTML ingest in existing worker
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r-- python/sandcrawler/html_ingest.py | 29
1 files changed, 22 insertions, 7 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index fe883ba..11909e6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -76,21 +76,36 @@ class IngestWebResult(pydantic.BaseModel):
datetime.datetime: lambda dt: dt.isoformat(),
}
+class HtmlMetaRow(pydantic.BaseModel):
+ sha1hex: str
+ status: str
+ scope: Optional[str]
+ has_teixml: bool
+ has_thumbnail: bool
+ word_count: Optional[int]
+ biblio: Optional[dict]
+ resources: Optional[List[dict]]
+
+ class Config:
+ arbitrary_types_allowed = True
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat(),
+ }
+
def to_sql_tuple(self) -> Tuple:
"""
This is for the html_meta SQL table.
"""
- assert self.file_meta and "sha1hex" in self.file_meta
return (
- self.file_meta["sha1hex"],
+ self.sha1hex,
datetime.datetime.now(), # updated
self.status,
self.scope,
- bool(self.html_body and self.html_body['status'] == 'success' and self.html_body['tei_xml']),
- False, # has_thumbnail
- (self.html_body and self.html_body.get('word_count')) or None,
- self.html_biblio,
- self.html_resources,
+ self.has_teixml,
+ self.has_thumbnail,
+ self.word_count,
+ self.biblio and json.dumps(self.biblio, sort_keys=True),
+ self.resources and json.dumps(self.resources, sort_keys=True),
)