aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-07-25 13:10:10 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-25 13:11:49 -0700
commit: 528804ad2e55983cf3e5e6659d8f46db0cab02b7 (patch)
tree: 801df5008ca69ff2c9de17025f238e32fbff61b5 /fatcat_scholar
parent: 4b970481d9df4c495fb1df24238df7afbd52cf65 (diff)
download: fatcat-scholar-528804ad2e55983cf3e5e6659d8f46db0cab02b7.tar.gz
download: fatcat-scholar-528804ad2e55983cf3e5e6659d8f46db0cab02b7.zip
refs transform: 1-index refs.index, not 0-index
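The previous 0-indexed values did not match the expectations/schema of the downstream refs pipeline (cgraph), nor the documented schema. Note that care is required when checking whether the index is set: the check must distinguish between '0' (a valid index) and 'None' (no index), so it tests against 'None' explicitly rather than relying on truthiness.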
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/schema.py 2
-rw-r--r-- fatcat_scholar/transform.py 14
2 files changed, 12 insertions, 4 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index e6d0422..cae8c12 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -284,7 +284,7 @@ class RefStructured(BaseModel):
work_ident: Optional[str]
release_stage: Optional[str]
release_year: Optional[int]
- index: Optional[int]
+ index: Optional[int] # 1-indexed
key: Optional[str]
locator: Optional[str]
target_release_id: Optional[str]
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index f9616c4..22e2e8f 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -619,6 +619,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
if a.get("name"):
assert isinstance(a["name"], str)
authors.append(a["name"])
+ ref_index = ref.get("index")
+ if ref_index is not None:
+ # transform from 0-indexed to 1-indexed
+ ref_index = ref_index + 1
output.append(
RefStructured(
biblio=RefBiblio(
@@ -643,7 +647,7 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
work_ident=release.work_id,
release_stage=release.release_stage,
release_year=release.release_year,
- index=ref.get("index"),
+ index=ref_index,
key=ref.get("id"),
locator=None,
# target_release_id
@@ -676,6 +680,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
extra = ref.extra or dict()
authors = extra.get("authors") or []
authors = [a for a in authors if type(a) == str]
+ ref_index = None
+ if ref.index is not None:
+ # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs)
+ ref_index = ref.index + 1
output.append(
RefStructured(
biblio=RefBiblio(
@@ -699,7 +707,7 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
release_ident=release.ident,
work_ident=release.work_id,
release_year=release.release_year,
- index=ref.index,
+ index=ref_index,
key=key or None,
locator=ref.locator,
target_release_id=ref.target_release_id,
@@ -757,7 +765,7 @@ def refs_from_crossref(
release_ident=release.ident,
work_ident=release.work_id,
release_year=release.release_year,
- index=i,
+ index=i + 1, # 1-indexed
key=key or None,
locator=ref.get("first-page"),
target_release_id=None,