diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-13 23:37:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-13 23:37:09 -0700 |
commit | 774ac98c2ca0a1c66c3283d466245cc487d602d3 (patch) | |
tree | 93add99bf9d4467ebb59797e99eb8abb4a62ec6d /fatcat_scholar/transform.py | |
parent | 2e659b6bad1ab429d36fcd8cb1a686eab81e6d89 (diff) | |
download | fatcat-scholar-774ac98c2ca0a1c66c3283d466245cc487d602d3.tar.gz fatcat-scholar-774ac98c2ca0a1c66c3283d466245cc487d602d3.zip |
refs transform: both GROBID and fatcat refs
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 17 |
1 files changed, 12 insertions, 5 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index b4b5c8d..50b6810 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -536,6 +536,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> Sequence[RefStructured]:
 
 
 def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
+    """
+    Current behavior is to return *both* fatcat refs and GROBID refs if
+    available.
+    """
     if heavy.doc_type != DocType.work:
         return []
 
@@ -546,19 +550,22 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
         r for r in heavy.releases if r.ident == heavy.biblio_release_ident
     ][0]
 
+    refs: List[RefStructured] = []
+
     if primary_release.refs:
         # TODO: what about other releases?
-        return refs_from_release_refs(primary_release)
-    elif heavy.grobid_fulltext:
+        refs.extend(refs_from_release_refs(primary_release))
+
+    if heavy.grobid_fulltext:
         fulltext_release = [
             r
             for r in heavy.releases
             if r.ident == heavy.grobid_fulltext["release_ident"]
         ][0]
         tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
-        return refs_from_grobid(fulltext_release, tei_dict)
-    else:
-        return []
+        refs.extend(refs_from_grobid(fulltext_release, tei_dict))
+
+    return refs
 
 
 def run_transform(infile: Sequence) -> None: