summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/schema.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:32 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:35 -0700
commit: c4f5ba60cf3581dc58875a4e56f8964560496753 (patch)
tree: 43795fa1223961f28f2fac52a29e5e155a85f77f /fatcat_scholar/schema.py
parent: 0f8b248259b4b57e425f7420883cb141565b2b22 (diff)
download: fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.tar.gz
fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.zip
fetch pdftotext and pdf_meta from blobs, postgrest
This replaces the temporary COVID-19 content hack with production content (text, thumbnail URLs) stored in postgrest and seaweedfs.
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- fatcat_scholar/schema.py 9
1 files changed, 5 insertions, 4 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index d74f018..cf88011 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -7,7 +7,7 @@ auto-conversion of datetime objects.
import re
import datetime
from enum import Enum
-from typing import Optional, List, Any
+from typing import Optional, List, Any, Dict
import ftfy
from bs4 import BeautifulSoup
@@ -30,9 +30,10 @@ class IntermediateBundle(BaseModel):
doc_type: DocType
releases: List[ReleaseEntity]
biblio_release_ident: Optional[str]
- grobid_fulltext: Optional[Any]
- pdftotext_fulltext: Optional[Any]
- sim_fulltext: Optional[Any]
+ grobid_fulltext: Optional[Dict[str, Any]]
+ pdftotext_fulltext: Optional[Dict[str, Any]]
+ pdf_meta: Optional[Dict[str, Any]]
+ sim_fulltext: Optional[Dict[str, Any]]
class Config:
arbitrary_types_allowed = True