diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-25 13:10:10 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-25 13:11:49 -0700 |
commit | 528804ad2e55983cf3e5e6659d8f46db0cab02b7 (patch) | |
tree | 801df5008ca69ff2c9de17025f238e32fbff61b5 | |
parent | 4b970481d9df4c495fb1df24238df7afbd52cf65 (diff) | |
download | fatcat-scholar-528804ad2e55983cf3e5e6659d8f46db0cab02b7.tar.gz fatcat-scholar-528804ad2e55983cf3e5e6659d8f46db0cab02b7.zip |
refs transform: 1-index refs.index, not 0-index
The previous 0-indexed values did not match the expectations/schema of the downstream refs pipeline
(cgraph), and did not match the documented schema either.
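Note: care is required when checking whether the index is set, in order to distinguish
between '0' and 'None' values (the check must be `is not None`, not a truthiness test, since 0 is a valid index).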
-rw-r--r-- | fatcat_scholar/schema.py | 2 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 14 | ||||
-rw-r--r-- | tests/test_refs_transform.py | 2 |
3 files changed, 13 insertions, 5 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index e6d0422..cae8c12 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -284,7 +284,7 @@ class RefStructured(BaseModel): work_ident: Optional[str] release_stage: Optional[str] release_year: Optional[int] - index: Optional[int] + index: Optional[int] # 1-indexed key: Optional[str] locator: Optional[str] target_release_id: Optional[str] diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index f9616c4..22e2e8f 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -619,6 +619,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur if a.get("name"): assert isinstance(a["name"], str) authors.append(a["name"]) + ref_index = ref.get("index") + if ref_index is not None: + # transform from 0-indexed to 1-indexed + ref_index = ref_index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -643,7 +647,7 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur work_ident=release.work_id, release_stage=release.release_stage, release_year=release.release_year, - index=ref.get("index"), + index=ref_index, key=ref.get("id"), locator=None, # target_release_id @@ -676,6 +680,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: extra = ref.extra or dict() authors = extra.get("authors") or [] authors = [a for a in authors if type(a) == str] + ref_index = None + if ref.index is not None: + # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs) + ref_index = ref.index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -699,7 +707,7 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: release_ident=release.ident, work_ident=release.work_id, release_year=release.release_year, - index=ref.index, + index=ref_index, key=key or None, locator=ref.locator, target_release_id=ref.target_release_id, @@ -757,7 +765,7 @@ def 
refs_from_crossref( release_ident=release.ident, work_ident=release.work_id, release_year=release.release_year, - index=i, + index=i + 1, # 1-indexed key=key or None, locator=ref.get("first-page"), target_release_id=None, diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py index 3fa490b..5b48396 100644 --- a/tests/test_refs_transform.py +++ b/tests/test_refs_transform.py @@ -27,7 +27,7 @@ def test_transform_refs_grobid() -> None: assert ref.release_year == 1234 assert ref.ref_source == "grobid" assert ref.key == "b12" - assert ref.index == 12 + assert ref.index == 13 assert ref.locator == None assert ref.biblio.contrib_raw_names is not None assert ref.biblio.contrib_raw_names[0] == "K Tasa" |