diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-25 13:10:10 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-25 13:11:49 -0700 |
commit | 528804ad2e55983cf3e5e6659d8f46db0cab02b7 (patch) | |
tree | 801df5008ca69ff2c9de17025f238e32fbff61b5 /fatcat_scholar | |
parent | 4b970481d9df4c495fb1df24238df7afbd52cf65 (diff) | |
download | fatcat-scholar-528804ad2e55983cf3e5e6659d8f46db0cab02b7.tar.gz fatcat-scholar-528804ad2e55983cf3e5e6659d8f46db0cab02b7.zip |
refs transform: 1-index refs.index, not 0-index
The previous 0-indexed values did not match the expectations/schema of the
downstream refs pipeline (cgraph), nor the documented schema.
Note: care is required when checking whether the index is set, because an index
of '0' is a valid value but is falsy, and must be distinguished from 'None'.
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/schema.py | 2 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 14 |
2 files changed, 12 insertions, 4 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index e6d0422..cae8c12 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -284,7 +284,7 @@ class RefStructured(BaseModel): work_ident: Optional[str] release_stage: Optional[str] release_year: Optional[int] - index: Optional[int] + index: Optional[int] # 1-indexed key: Optional[str] locator: Optional[str] target_release_id: Optional[str] diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index f9616c4..22e2e8f 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -619,6 +619,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur if a.get("name"): assert isinstance(a["name"], str) authors.append(a["name"]) + ref_index = ref.get("index") + if ref_index is not None: + # transform from 0-indexed to 1-indexed + ref_index = ref_index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -643,7 +647,7 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur work_ident=release.work_id, release_stage=release.release_stage, release_year=release.release_year, - index=ref.get("index"), + index=ref_index, key=ref.get("id"), locator=None, # target_release_id @@ -676,6 +680,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: extra = ref.extra or dict() authors = extra.get("authors") or [] authors = [a for a in authors if type(a) == str] + ref_index = None + if ref.index is not None: + # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs) + ref_index = ref.index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -699,7 +707,7 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: release_ident=release.ident, work_ident=release.work_id, release_year=release.release_year, - index=ref.index, + index=ref_index, key=key or None, locator=ref.locator, target_release_id=ref.target_release_id, @@ -757,7 +765,7 @@ def 
refs_from_crossref( release_ident=release.ident, work_ident=release.work_id, release_year=release.release_year, - index=i, + index=i + 1, # 1-indexed key=key or None, locator=ref.get("first-page"), target_release_id=None, |