diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 17:05:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 17:05:35 -0700 |
commit | c4f5ba60cf3581dc58875a4e56f8964560496753 (patch) | |
tree | 43795fa1223961f28f2fac52a29e5e155a85f77f /fatcat_scholar/schema.py | |
parent | 0f8b248259b4b57e425f7420883cb141565b2b22 (diff) | |
download | fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.tar.gz fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.zip |
fetch pdftotext and pdf_meta from blobs, postgrest
This replaces the temporary COVID-19 content hack with production
content (text, thumbnail URLs) stored in postgrest and seaweedfs.
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- | fatcat_scholar/schema.py | 9 |
1 files changed, 5 insertions, 4 deletions
```diff
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index d74f018..cf88011 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -7,7 +7,7 @@ auto-conversion of datetime objects.
 import re
 import datetime
 from enum import Enum
-from typing import Optional, List, Any
+from typing import Optional, List, Any, Dict
 
 import ftfy
 from bs4 import BeautifulSoup
@@ -30,9 +30,10 @@ class IntermediateBundle(BaseModel):
     doc_type: DocType
     releases: List[ReleaseEntity]
     biblio_release_ident: Optional[str]
-    grobid_fulltext: Optional[Any]
-    pdftotext_fulltext: Optional[Any]
-    sim_fulltext: Optional[Any]
+    grobid_fulltext: Optional[Dict[str, Any]]
+    pdftotext_fulltext: Optional[Dict[str, Any]]
+    pdf_meta: Optional[Dict[str, Any]]
+    sim_fulltext: Optional[Dict[str, Any]]
 
     class Config:
         arbitrary_types_allowed = True
```