diff options
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 142 |
1 files changed, 108 insertions, 34 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index f9616c4..3a7102a 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -483,7 +483,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: raise NotImplementedError(f"doc_type: {heavy.doc_type}") # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ - if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': + if ( + heavy.grobid_fulltext + and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq" + ): fulltext_release = [ r for r in heavy.releases @@ -603,6 +606,55 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: ) +def clean_ref_key(key: Optional[str], doi: Optional[str] = None) -> Optional[str]: + if not key: + return None + key = key.strip() + if key and doi and key.startswith(doi): + key = key.replace(doi + "-", "") + key = key.replace(doi, "") + if key.startswith("10.") and "SICI" in key and "-" in key: + subkey = key.split("-")[-1] + if subkey: + key = subkey + if key.startswith("10.") and "_" in key: + subkey = key.split("_")[-1] + if subkey: + key = subkey + if len(key) > 10 and "#" in key: + subkey = key.split("#")[-1] + if subkey: + key = subkey + if len(key) > 10 and "_" in key: + subkey = key.split("_")[-1] + if subkey: + key = subkey + if key and key.startswith("ref-"): + key = key[4:] + if len(key) >= 2 and key[0] in ["/", "_"]: + key = key[1:] + if not key: + return None + return key + + +def test_clean_ref_key() -> None: + test_pairs = [ + ("ref-23", None, "23"), + ("_bib0040", None, "bib0040"), + (" 20170224012016_R15", None, "R15"), + ( + "10.1002/(SICI)1099-1026(199905/06)14:3<195::AID-FFJ807>3.0.CO;2-C-BIB1", + None, + "BIB1", + ), + ("BFnrcardio201557_CR175", None, "CR175"), + ("2019121710443552100_", None, "2019121710443552100_"), + ] + for raw, doi, expected in test_pairs: + assert clean_ref_key(raw, doi=doi) == expected + + def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]: output = [] for ref in tei_dict.get("citations") or []: @@ -619,6 +671,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur if a.get("name"): assert isinstance(a["name"], str) authors.append(a["name"]) + ref_index = ref.get("index") + if ref_index is not None: + # transform from 0-indexed to 1-indexed + ref_index = ref_index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -636,15 +692,15 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur pmid=ref.get("pmid"), pmcid=clean_pmcid(ref.get("pmcid")), arxiv_id=ref.get("arxiv_id"), - # isbn13: Optional[str] + isbn=ref.get("isbn"), url=clean_url_conservative(ref.get("url")), ), release_ident=release.ident, work_ident=release.work_id, release_stage=release.release_stage, release_year=release.release_year, - index=ref.get("index"), - key=ref.get("id"), + index=ref_index, + key=clean_ref_key(ref.get("id")), locator=None, # target_release_id ref_source="grobid", @@ -658,14 +714,6 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: for ref in release.refs: ref_source = "fatcat" - key = ref.key - if key and release.ext_ids.doi and key.startswith(release.ext_ids.doi): - key = key.replace(release.ext_ids.doi, "") - if key and key.startswith("ref-"): - key = key[4:] - if key and key.startswith("b"): - key = key[1:] - if release.extra and release.extra.get("pubmed"): ref_source = "fatcat-pubmed" elif release.extra and release.extra.get("crossref"): @@ -676,6 +724,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: extra = ref.extra or dict() authors = extra.get("authors") or [] authors = [a for a in authors if type(a) == str] + ref_index = None + if ref.index is not None: + # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs) + ref_index = ref.index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -689,18 +741,19 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: volume=extra.get("volume"), issue=extra.get("issue"), pages=extra.get("pages") or extra.get("page"), - doi=extra.get("doi"), + doi=clean_doi(extra.get("doi")), pmid=extra.get("pmid"), - pmcid=extra.get("pmcid"), + pmcid=clean_pmcid(extra.get("pmcid")), arxiv_id=extra.get("arxiv_id"), - isbn13=extra.get("isbn13"), + isbn=extra.get("isbn13") or extra.get("isbn"), url=clean_url_conservative(extra.get("url")), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, - index=ref.index, - key=key or None, + index=ref_index, + key=clean_ref_key(ref.key, doi=release.ext_ids.doi), locator=ref.locator, target_release_id=ref.target_release_id, ref_source=ref_source, @@ -724,26 +777,41 @@ def refs_from_crossref( authors = [ ref["author"], ] - key = ref.get("key") - if key and key.startswith(record["DOI"]): - key = key.replace(record["DOI"] + "-", "") - key = key.replace(record["DOI"], "") - if key and key.startswith("ref-"): - key = key[4:] + ref_title = ref.get("article-title") ref_container_name = ref.get("journal-title") if not ref_container_name: + ref_container_name = ref.get("container-title") + + # volume-title is often a book title + if not ref_title: + ref_title = ref.get("volume-title") + elif not ref_container_name: ref_container_name = ref.get("volume-title") + + # series-title is a bit weird in Crossref references. it is often + # passed alone and seems to be the article/book title miscategorized. + # other times it is a conference name. + series_title = ref.get("series-title") + if not ref_title: + ref_title = series_title + elif not ref_container_name: + ref_container_name = series_title + + year = ref.get("year") + if year: + year = clean_small_int(year) + else: + year = None date = ref.get("date") - year = None - if date and len(date) >= 4 and date[:4].isdigit(): + if date and not year and len(date) >= 4 and date[:4].isdigit(): year = int(date[:4]) - if year < 1000 or year > 2100: - year = None + if year and (year < 1000 or year > 2100): + year = None output.append( RefStructured( biblio=RefBiblio( unstructured=ref.get("unstructured"), - title=ref.get("article-title"), + title=ref_title, subtitle=ref.get("subtitle"), contrib_raw_names=authors, year=year, @@ -751,15 +819,18 @@ def refs_from_crossref( publisher=ref.get("publisher"), volume=ref.get("volume"), issue=ref.get("issue"), - pages=ref.get("page"), - doi=ref.get("DOI"), + pages=ref.get("first-page"), + version=ref.get("edition"), + doi=clean_doi(ref.get("DOI")), + isbn=ref.get("ISBN"), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, - index=i, - key=key or None, - locator=ref.get("first-page"), + index=i + 1, # 1-indexed + key=clean_ref_key(ref.get("key"), doi=record.get("DOI")), + # locator, target_release_id=None, ref_source=ref_source, ) @@ -795,7 +866,10 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]: fulltext_refs: List[RefStructured] = [] # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ - if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': + if ( + heavy.grobid_fulltext + and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq" + ): fulltext_release = [ r for r in heavy.releases |