aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/html_metadata.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-03 23:37:50 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-04 09:04:47 -0800
commit a87ca1de1d8b31c4fbf9fddead27cdc58b09565a (patch)
tree 575868d7d9c0d3d28a37d288d5e4975e57c8eaab /python/sandcrawler/html_metadata.py
parent 8f964b9b48572ac71f27ba64207816dfd3a6dc36 (diff)
download sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.tar.gz
sandcrawler-a87ca1de1d8b31c4fbf9fddead27cdc58b09565a.zip
initial implementation of HTML ingest in existing worker
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- python/sandcrawler/html_metadata.py 5
1 file changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 3ebba57..8928978 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -248,6 +248,11 @@ class BiblioMetadata(pydantic.BaseModel):
html_fulltext_url: Optional[str]
xml_fulltext_url: Optional[str]
+ class Config:
+ json_encoders = {
+ datetime.date: lambda dt: dt.isoformat()
+ }
+
def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
"""