diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-06-01 01:04:16 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-06-02 00:38:51 -0700 |
commit | 01a1978d5b9667df4ae71a7934512e6c4e3bf9a8 (patch) | |
tree | d9d0e87e0c1bdb842696018174db4f432bab20c9 /fatcat_scholar/transform.py | |
parent | 86b29ed5fca70fc0c52443acf6a5ec1a398ed3f6 (diff) | |
download | fatcat-scholar-01a1978d5b9667df4ae71a7934512e6c4e3bf9a8.tar.gz fatcat-scholar-01a1978d5b9667df4ae71a7934512e6c4e3bf9a8.zip |
schema: add 'crossref' to bundle schema, and add from_json() helper
The from_json() refactor was an earlier TODO; it reduces duplication when
fields on this class are updated.
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 30 |
1 file changed, 4 insertions, 26 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 53f83ae..1c4b0e7 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -733,19 +733,8 @@ def run_transform(infile: Sequence) -> None: for line in infile: obj = json.loads(line) - heavy = IntermediateBundle( - doc_type=DocType(obj["doc_type"]), - releases=[ - entity_from_json(json.dumps(re), ReleaseEntity) - for re in obj["releases"] - ], - biblio_release_ident=obj.get("biblio_release_ident"), - grobid_fulltext=obj.get("grobid_fulltext"), - pdftotext_fulltext=obj.get("pdftotext_fulltext"), - pdf_meta=obj.get("pdf_meta"), - sim_fulltext=obj.get("sim_fulltext"), - html_fulltext=obj.get("html_fulltext"), - ) + heavy = IntermediateBundle.from_json(obj) + assert heavy.doc_type es_doc = transform_heavy(heavy) if not es_doc: continue @@ -756,19 +745,8 @@ def run_refs(infile: Sequence) -> None: for line in infile: obj = json.loads(line) - heavy = IntermediateBundle( - doc_type=DocType(obj["doc_type"]), - releases=[ - entity_from_json(json.dumps(re), ReleaseEntity) - for re in obj["releases"] - ], - biblio_release_ident=obj.get("biblio_release_ident"), - grobid_fulltext=obj.get("grobid_fulltext"), - pdftotext_fulltext=obj.get("pdftotext_fulltext"), - pdf_meta=obj.get("pdf_meta"), - sim_fulltext=obj.get("sim_fulltext"), - html_fulltext=obj.get("html_fulltext"), - ) + heavy = IntermediateBundle.from_json(obj) + assert heavy.doc_type refs = refs_from_heavy(heavy) for ref in refs: print(ref.json(exclude_none=True, sort_keys=True)) |