diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 13:27:55 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 13:27:55 -0700 | 
| commit | f2c465fffc76ca752249e11d32673db43efc35f1 (patch) | |
| tree | e000389c916c56c322a984ebdf440a2c6129a0e0 /fatcat_scholar/work_pipeline.py | |
| parent | 3ee18580dd108c69c01cdf838a7f1a7d3d181629 (diff) | |
| download | fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.tar.gz fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.zip | |
first pass transform from pipelines to ES schema
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
| -rw-r--r-- | fatcat_scholar/work_pipeline.py | 17 | 
1 file changed, 1 insertion, 16 deletions
```diff
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index b9dcbe8..081878c 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -12,7 +12,7 @@ from fatcat_scholar.api_entities import *
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
 from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
 from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.es_transform import es_biblio_from_release, es_release_from_release, DocType
+from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
 
 
 def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]:
@@ -44,21 +44,6 @@ def test_parse_pages():
     assert parse_pages("iiv") == (None, None)
 
 
-class IntermediateBundle(BaseModel):
-    doc_type: DocType
-    releases: List[ReleaseEntity]
-    biblio_release_ident: Optional[str]
-    grobid_fulltext: Optional[Any]
-    pdftotext_fulltext: Optional[Any]
-    sim_fulltext: Optional[Any]
-
-    class Config:
-        arbitrary_types_allowed = True
-        json_encoders = {
-            ReleaseEntity: lambda re: entity_to_dict(re),
-        }
-
-
 def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
     """
     Returns a list of release idents in preference order (best first) to
```
