diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 13:27:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 13:27:55 -0700 |
commit | f2c465fffc76ca752249e11d32673db43efc35f1 (patch) | |
tree | e000389c916c56c322a984ebdf440a2c6129a0e0 /fatcat_scholar/work_pipeline.py | |
parent | 3ee18580dd108c69c01cdf838a7f1a7d3d181629 (diff) | |
download | fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.tar.gz fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.zip |
first pass transform from pipelines to ES schema
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 17 |
1 file changed, 1 insertion, 16 deletions
```diff
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index b9dcbe8..081878c 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -12,7 +12,7 @@ from fatcat_scholar.api_entities import *
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
 from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
 from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.es_transform import es_biblio_from_release, es_release_from_release, DocType
+from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
 
 
 def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]:
@@ -44,21 +44,6 @@ def test_parse_pages():
     assert parse_pages("iiv") == (None, None)
 
 
-class IntermediateBundle(BaseModel):
-    doc_type: DocType
-    releases: List[ReleaseEntity]
-    biblio_release_ident: Optional[str]
-    grobid_fulltext: Optional[Any]
-    pdftotext_fulltext: Optional[Any]
-    sim_fulltext: Optional[Any]
-
-    class Config:
-        arbitrary_types_allowed = True
-        json_encoders = {
-            ReleaseEntity: lambda re: entity_to_dict(re),
-        }
-
-
 def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
     """
     Returns a list of release idents in preference order (best first) to
```