aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/transform.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r--fatcat_scholar/transform.py142
1 files changed, 108 insertions, 34 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index f9616c4..3a7102a 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -483,7 +483,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
raise NotImplementedError(f"doc_type: {heavy.doc_type}")
# TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
- if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq':
+ if (
+ heavy.grobid_fulltext
+ and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
+ ):
fulltext_release = [
r
for r in heavy.releases
@@ -603,6 +606,55 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
)
def clean_ref_key(key: Optional[str], doi: Optional[str] = None) -> Optional[str]:
    """
    Reduce a raw reference key to a short label, e.g. "ref-23" -> "23".

    The heuristics below are applied in order, each one operating on the
    result of the previous step:

    1. surrounding whitespace is stripped
    2. if `doi` is given and the key starts with it (compared
       case-insensitively, since DOI names are case-insensitive), that
       prefix and one directly following "-" are removed
    3. keys still starting with "10." (DOI-like) keep only the text after
       the last "-" (when the key contains "SICI") and then after the last
       "_"
    4. keys longer than 10 characters keep only the text after the last "#"
       and then after the last "_"
    5. a leading "ref-" is dropped
    6. a single leading "/" or "_" is dropped, unless that would leave
       nothing

    A split step is skipped whenever it would leave an empty string, so a
    key such as "2019121710443552100_" is returned unchanged.

    Args:
        key: raw reference key; may be None or empty.
        doi: optional DOI to strip from the front of the key.

    Returns:
        The cleaned key, or None if the input was empty or nothing remained
        after cleaning.
    """
    if not key:
        return None
    key = key.strip()

    # Only the leading occurrence of the DOI is removed; slicing `key` at
    # len(doi) keeps the comparison safe even if lower() changed a length.
    if doi and key[: len(doi)].lower() == doi.lower():
        key = key[len(doi) :]
        if key.startswith("-"):
            key = key[1:]

    # `tail or key` keeps the current key when the last segment is empty
    if key.startswith("10.") and "SICI" in key and "-" in key:
        key = key.split("-")[-1] or key
    if key.startswith("10.") and "_" in key:
        key = key.split("_")[-1] or key
    if len(key) > 10 and "#" in key:
        key = key.split("#")[-1] or key
    if len(key) > 10 and "_" in key:
        key = key.split("_")[-1] or key

    if key.startswith("ref-"):
        key = key[4:]
    if len(key) >= 2 and key[0] in ("/", "_"):
        key = key[1:]

    return key or None
+
+
def test_clean_ref_key() -> None:
    """
    Check clean_ref_key() against (raw key, doi, expected result) triples.

    Covers the prefix/suffix heuristics with no DOI supplied, the branch
    that strips a supplied DOI from the front of the key, and inputs that
    must come back as None.
    """
    test_pairs = [
        # no DOI supplied
        ("ref-23", None, "23"),
        ("_bib0040", None, "bib0040"),
        (" 20170224012016_R15", None, "R15"),
        (
            "10.1002/(SICI)1099-1026(199905/06)14:3<195::AID-FFJ807>3.0.CO;2-C-BIB1",
            None,
            "BIB1",
        ),
        ("BFnrcardio201557_CR175", None, "CR175"),
        # trailing separator: the empty last segment is rejected, key kept
        ("2019121710443552100_", None, "2019121710443552100_"),
        # DOI supplied and used as a prefix of the key
        ("10.1234/abc-ref-3", "10.1234/abc", "3"),
        ("10.1234/abc_bib7", "10.1234/abc", "bib7"),
        ("10.1234/abc", "10.1234/abc", None),
        # empty-ish input
        (None, None, None),
        ("", None, None),
        ("   ", None, None),
    ]
    for raw, doi, expected in test_pairs:
        result = clean_ref_key(raw, doi=doi)
        assert result == expected, f"{raw!r} (doi={doi!r}): {result!r} != {expected!r}"
+
+
def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]:
output = []
for ref in tei_dict.get("citations") or []:
@@ -619,6 +671,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
if a.get("name"):
assert isinstance(a["name"], str)
authors.append(a["name"])
+ ref_index = ref.get("index")
+ if ref_index is not None:
+ # transform from 0-indexed to 1-indexed
+ ref_index = ref_index + 1
output.append(
RefStructured(
biblio=RefBiblio(
@@ -636,15 +692,15 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
pmid=ref.get("pmid"),
pmcid=clean_pmcid(ref.get("pmcid")),
arxiv_id=ref.get("arxiv_id"),
- # isbn13: Optional[str]
+ isbn=ref.get("isbn"),
url=clean_url_conservative(ref.get("url")),
),
release_ident=release.ident,
work_ident=release.work_id,
release_stage=release.release_stage,
release_year=release.release_year,
- index=ref.get("index"),
- key=ref.get("id"),
+ index=ref_index,
+ key=clean_ref_key(ref.get("id")),
locator=None,
# target_release_id
ref_source="grobid",
@@ -658,14 +714,6 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
for ref in release.refs:
ref_source = "fatcat"
- key = ref.key
- if key and release.ext_ids.doi and key.startswith(release.ext_ids.doi):
- key = key.replace(release.ext_ids.doi, "")
- if key and key.startswith("ref-"):
- key = key[4:]
- if key and key.startswith("b"):
- key = key[1:]
-
if release.extra and release.extra.get("pubmed"):
ref_source = "fatcat-pubmed"
elif release.extra and release.extra.get("crossref"):
@@ -676,6 +724,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
extra = ref.extra or dict()
authors = extra.get("authors") or []
authors = [a for a in authors if type(a) == str]
+ ref_index = None
+ if ref.index is not None:
+ # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs)
+ ref_index = ref.index + 1
output.append(
RefStructured(
biblio=RefBiblio(
@@ -689,18 +741,19 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
volume=extra.get("volume"),
issue=extra.get("issue"),
pages=extra.get("pages") or extra.get("page"),
- doi=extra.get("doi"),
+ doi=clean_doi(extra.get("doi")),
pmid=extra.get("pmid"),
- pmcid=extra.get("pmcid"),
+ pmcid=clean_pmcid(extra.get("pmcid")),
arxiv_id=extra.get("arxiv_id"),
- isbn13=extra.get("isbn13"),
+ isbn=extra.get("isbn13") or extra.get("isbn"),
url=clean_url_conservative(extra.get("url")),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_stage=release.release_stage,
release_year=release.release_year,
- index=ref.index,
- key=key or None,
+ index=ref_index,
+ key=clean_ref_key(ref.key, doi=release.ext_ids.doi),
locator=ref.locator,
target_release_id=ref.target_release_id,
ref_source=ref_source,
@@ -724,26 +777,41 @@ def refs_from_crossref(
authors = [
ref["author"],
]
- key = ref.get("key")
- if key and key.startswith(record["DOI"]):
- key = key.replace(record["DOI"] + "-", "")
- key = key.replace(record["DOI"], "")
- if key and key.startswith("ref-"):
- key = key[4:]
+ ref_title = ref.get("article-title")
ref_container_name = ref.get("journal-title")
if not ref_container_name:
+ ref_container_name = ref.get("container-title")
+
+ # volume-title is often a book title
+ if not ref_title:
+ ref_title = ref.get("volume-title")
+ elif not ref_container_name:
ref_container_name = ref.get("volume-title")
+
+ # series-title is a bit weird in Crossref references. it is often
+ # passed alone and seems to be the article/book title miscategorized.
+ # other times it is a conference name.
+ series_title = ref.get("series-title")
+ if not ref_title:
+ ref_title = series_title
+ elif not ref_container_name:
+ ref_container_name = series_title
+
+ year = ref.get("year")
+ if year:
+ year = clean_small_int(year)
+ else:
+ year = None
date = ref.get("date")
- year = None
- if date and len(date) >= 4 and date[:4].isdigit():
+ if date and not year and len(date) >= 4 and date[:4].isdigit():
year = int(date[:4])
- if year < 1000 or year > 2100:
- year = None
+ if year and (year < 1000 or year > 2100):
+ year = None
output.append(
RefStructured(
biblio=RefBiblio(
unstructured=ref.get("unstructured"),
- title=ref.get("article-title"),
+ title=ref_title,
subtitle=ref.get("subtitle"),
contrib_raw_names=authors,
year=year,
@@ -751,15 +819,18 @@ def refs_from_crossref(
publisher=ref.get("publisher"),
volume=ref.get("volume"),
issue=ref.get("issue"),
- pages=ref.get("page"),
- doi=ref.get("DOI"),
+ pages=ref.get("first-page"),
+ version=ref.get("edition"),
+ doi=clean_doi(ref.get("DOI")),
+ isbn=ref.get("ISBN"),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_stage=release.release_stage,
release_year=release.release_year,
- index=i,
- key=key or None,
- locator=ref.get("first-page"),
+ index=i + 1, # 1-indexed
+ key=clean_ref_key(ref.get("key"), doi=record.get("DOI")),
+ # locator,
target_release_id=None,
ref_source=ref_source,
)
@@ -795,7 +866,10 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
fulltext_refs: List[RefStructured] = []
# TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
- if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq':
+ if (
+ heavy.grobid_fulltext
+ and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
+ ):
fulltext_release = [
r
for r in heavy.releases