From 01a1978d5b9667df4ae71a7934512e6c4e3bf9a8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 1 Jun 2021 01:04:16 -0700 Subject: schema: add 'crossref' to bundle schema, and add from_json() helper from_json() refactor was an earlier TODO, to reduce duplication when updating fields on this class --- fatcat_scholar/schema.py | 21 ++++++++++++++++++++- fatcat_scholar/sim_pipeline.py | 1 + fatcat_scholar/transform.py | 30 ++++-------------------------- fatcat_scholar/work_pipeline.py | 1 + fatcat_scholar/worker.py | 15 ++------------- 5 files changed, 28 insertions(+), 40 deletions(-) (limited to 'fatcat_scholar') diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 9912a97..b8a1923 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -5,6 +5,7 @@ auto-conversion of datetime objects. """ import re +import json import datetime from enum import Enum from typing import Optional, List, Any, Dict @@ -18,7 +19,7 @@ from pydantic import BaseModel # pytype: enable=import-error from fatcat_openapi_client import ReleaseEntity, ReleaseContrib -from fatcat_scholar.api_entities import entity_to_dict +from fatcat_scholar.api_entities import entity_to_dict, entity_from_json from fatcat_scholar.biblio_hacks import doi_link_domain @@ -31,6 +32,7 @@ class IntermediateBundle(BaseModel): doc_type: DocType releases: List[ReleaseEntity] biblio_release_ident: Optional[str] + crossref: Optional[Dict[str, Any]] grobid_fulltext: Optional[Dict[str, Any]] pdftotext_fulltext: Optional[Dict[str, Any]] pdf_meta: Optional[Dict[str, Any]] @@ -45,6 +47,23 @@ class IntermediateBundle(BaseModel): datetime.datetime: lambda dt: dt.isoformat(), } + @classmethod + def from_json(cls, obj: Dict[Any, Any]) -> "IntermediateBundle": + return IntermediateBundle( + doc_type=DocType(obj.get("doc_type")), + releases=[ + entity_from_json(json.dumps(re), ReleaseEntity) + for re in obj.get("releases", []) + ], + biblio_release_ident=obj.get("biblio_release_ident"), + 
crossref=obj.get("crossref"), + grobid_fulltext=obj.get("grobid_fulltext"), + pdftotext_fulltext=obj.get("pdftotext_fulltext"), + pdf_meta=obj.get("pdf_meta"), + sim_fulltext=obj.get("sim_fulltext"), + html_fulltext=obj.get("html_fulltext"), + ) + class AccessType(str, Enum): ia_sim = "ia_sim" diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py index e5e2a02..d602edf 100644 --- a/fatcat_scholar/sim_pipeline.py +++ b/fatcat_scholar/sim_pipeline.py @@ -122,6 +122,7 @@ class SimPipeline: doc_type=DocType.sim_page, releases=[], biblio_release_ident=None, + crossref=None, grobid_fulltext=None, pdftotext_fulltext=None, sim_fulltext=dict( diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 53f83ae..1c4b0e7 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -733,19 +733,8 @@ def run_transform(infile: Sequence) -> None: for line in infile: obj = json.loads(line) - heavy = IntermediateBundle( - doc_type=DocType(obj["doc_type"]), - releases=[ - entity_from_json(json.dumps(re), ReleaseEntity) - for re in obj["releases"] - ], - biblio_release_ident=obj.get("biblio_release_ident"), - grobid_fulltext=obj.get("grobid_fulltext"), - pdftotext_fulltext=obj.get("pdftotext_fulltext"), - pdf_meta=obj.get("pdf_meta"), - sim_fulltext=obj.get("sim_fulltext"), - html_fulltext=obj.get("html_fulltext"), - ) + heavy = IntermediateBundle.from_json(obj) + assert heavy.doc_type es_doc = transform_heavy(heavy) if not es_doc: continue @@ -756,19 +745,8 @@ def run_refs(infile: Sequence) -> None: for line in infile: obj = json.loads(line) - heavy = IntermediateBundle( - doc_type=DocType(obj["doc_type"]), - releases=[ - entity_from_json(json.dumps(re), ReleaseEntity) - for re in obj["releases"] - ], - biblio_release_ident=obj.get("biblio_release_ident"), - grobid_fulltext=obj.get("grobid_fulltext"), - pdftotext_fulltext=obj.get("pdftotext_fulltext"), - pdf_meta=obj.get("pdf_meta"), - sim_fulltext=obj.get("sim_fulltext"), - 
html_fulltext=obj.get("html_fulltext"), - ) + heavy = IntermediateBundle.from_json(obj) + assert heavy.doc_type refs = refs_from_heavy(heavy) for ref in refs: print(ref.json(exclude_none=True, sort_keys=True)) diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 4c8f1be..7b477a0 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -389,6 +389,7 @@ class WorkPipeline: doc_type=DocType.work, releases=releases, biblio_release_ident=pref_idents[0], + crossref=biblio_crossref, grobid_fulltext=grobid_fulltext, pdftotext_fulltext=pdftotext_fulltext, pdf_meta=pdf_meta, diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py index 823f1bd..7787d42 100644 --- a/fatcat_scholar/worker.py +++ b/fatcat_scholar/worker.py @@ -123,19 +123,8 @@ class IndexDocsWorker(KafkaWorker): bulk_actions = [] for obj in batch: - bundle = IntermediateBundle( - doc_type=DocType(obj["doc_type"]), - releases=[ - entity_from_json(json.dumps(re), ReleaseEntity) - for re in obj["releases"] - ], - biblio_release_ident=obj.get("biblio_release_ident"), - grobid_fulltext=obj.get("grobid_fulltext"), - pdftotext_fulltext=obj.get("pdftotext_fulltext"), - pdf_meta=obj.get("pdf_meta"), - html_fulltext=obj.get("html_fulltext"), - sim_fulltext=obj.get("sim_fulltext"), - ) + bundle = IntermediateBundle.from_json(obj) + assert bundle.doc_type es_doc = transform_heavy(bundle) if not es_doc: continue -- cgit v1.2.3