aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/transform.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-07-25 15:09:27 -0700
committerBryan Newbold <bnewbold@archive.org>2021-07-25 15:10:56 -0700
commitc613297265f101e31fe915ac74bc80b43f6ffe00 (patch)
treec870935e2784830ebae03f8412066c70cdc8521a /fatcat_scholar/transform.py
parent88fc23865de8cf8126cc4c6f0c14a3825d85e3c0 (diff)
downloadfatcat-scholar-c613297265f101e31fe915ac74bc80b43f6ffe00.tar.gz
fatcat-scholar-c613297265f101e31fe915ac74bc80b43f6ffe00.zip
refs transform: many fixes
- include year correctly (many cases) - test coverage for Crossref transform - pass-through 'edition' as 'version' - series-title parsed in to title or container as appropriate - missing release stage - fix 0-index vs. 1-index ref index field
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r--fatcat_scholar/transform.py43
1 files changed, 34 insertions, 9 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 22e2e8f..508b898 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -640,7 +640,7 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
pmid=ref.get("pmid"),
pmcid=clean_pmcid(ref.get("pmcid")),
arxiv_id=ref.get("arxiv_id"),
- # isbn13: Optional[str]
+ isbn=ref.get("isbn"),
url=clean_url_conservative(ref.get("url")),
),
release_ident=release.ident,
@@ -701,11 +701,12 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
pmid=extra.get("pmid"),
pmcid=extra.get("pmcid"),
arxiv_id=extra.get("arxiv_id"),
- isbn13=extra.get("isbn13"),
+ isbn=extra.get("isbn13") or extra.get("isbn"),
url=clean_url_conservative(extra.get("url")),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_stage=release.release_stage,
release_year=release.release_year,
index=ref_index,
key=key or None,
@@ -738,20 +739,41 @@ def refs_from_crossref(
key = key.replace(record["DOI"], "")
if key and key.startswith("ref-"):
key = key[4:]
+ ref_title = ref.get("article-title")
ref_container_name = ref.get("journal-title")
if not ref_container_name:
+ ref_container_name = ref.get("container-title")
+
+ # volume-title is often a book title
+ if not ref_title:
+ ref_title = ref.get("volume-title")
+ elif not ref_container_name:
ref_container_name = ref.get("volume-title")
+
+ # series-title is a bit weird in Crossref references. it is often
+ # passed alone and seems to be the article/book title miscategorized.
+ # other times it is a conference name.
+ series_title = ref.get("series-title")
+ if not ref_title:
+ ref_title = series_title
+ elif not ref_container_name:
+ ref_container_name = series_title
+
+ year = ref.get("year")
+ if year and year.isdigit():
+ year = int(year)
+ else:
+ year = None
date = ref.get("date")
- year = None
- if date and len(date) >= 4 and date[:4].isdigit():
+ if date and not year and len(date) >= 4 and date[:4].isdigit():
year = int(date[:4])
- if year < 1000 or year > 2100:
- year = None
+ if year and (year < 1000 or year > 2100):
+ year = None
output.append(
RefStructured(
biblio=RefBiblio(
unstructured=ref.get("unstructured"),
- title=ref.get("article-title"),
+ title=ref_title,
subtitle=ref.get("subtitle"),
contrib_raw_names=authors,
year=year,
@@ -759,15 +781,18 @@ def refs_from_crossref(
publisher=ref.get("publisher"),
volume=ref.get("volume"),
issue=ref.get("issue"),
- pages=ref.get("page"),
+ pages=ref.get("first-page"),
+ version=ref.get("edition"),
doi=ref.get("DOI"),
+ isbn=ref.get("ISBN"),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_stage=release.release_stage,
release_year=release.release_year,
index=i + 1, # 1-indexed
key=key or None,
- locator=ref.get("first-page"),
+ #locator,
target_release_id=None,
ref_source=ref_source,
)