From a87ca1de1d8b31c4fbf9fddead27cdc58b09565a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 23:37:50 -0800 Subject: initial implementation of HTML ingest in existing worker --- python/sandcrawler/html_metadata.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'python/sandcrawler/html_metadata.py') diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 3ebba57..8928978 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -248,6 +248,11 @@ class BiblioMetadata(pydantic.BaseModel): html_fulltext_url: Optional[str] xml_fulltext_url: Optional[str] + class Config: + json_encoders = { + datetime.date: lambda dt: dt.isoformat() + } + def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]: """ -- cgit v1.2.3