From 7fa9e7dd83e41f3d331cb6b10df5f950f3d5ec8f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 6 Dec 2021 16:52:18 -0800 Subject: refs: include GROBID-parsed crossref refs This takes advantage of Crossref 'unstructured' refs which have been parsed using GROBID and stored in the sandcrawler database, as part of the sandcrawler crossref metadata pipeline. --- fatcat_scholar/transform.py | 56 +++++++++++++++++++++++++++++--- tests/files/example_crossref_record.json | 1 + 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index d40e123..207c325 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -697,6 +697,7 @@ def refs_from_grobid( if a.full_name: assert isinstance(a.full_name, str) authors.append(a.full_name) + # TODO: else wheat? ref_index = ref.index if ref_index is not None: # transform from 0-indexed to 1-indexed @@ -794,10 +795,57 @@ def refs_from_crossref( record = crossref["record"] if not record.get("reference"): return [] + grobid_refs = dict() + for ref in crossref.get("grobid_refs") or []: + # TODO: some kind of check whether we should include this, based on + # source date or similar? + grobid_refs[ref["id"]] = ref output = [] for i, ref in enumerate(record.get("reference", [])): - ref_source = "crossref" - authors: Optional[List[str]] = None + if ref.get("unstructured") and ref["key"] in grobid_refs: + # use the GROBID-parsed ref instead of the crossref ref itself + # the schema is that of GrobidBiblio + grobid_ref = grobid_refs[ref["key"]] + ref_date = grobid_ref.get("date") + ref_year: Optional[int] = None + if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit(): + ref_year = int(ref_date[:4]) + authors: List[str] = [] + for a in grobid_ref.get("authors") or []: + if a.get("full_name"): + authors.append(a["full_name"]) + output.append( + RefStructured( + biblio=RefBiblio( + title=grobid_ref.get("title"), + # subtitle + contrib_raw_names=authors or None, + year=ref_year, + container_name=grobid_ref.get("journal"), + publisher=grobid_ref.get("publisher"), + volume=grobid_ref.get("volume"), + issue=grobid_ref.get("issue"), + pages=grobid_ref.get("pages"), + doi=clean_doi(grobid_ref.get("doi")), + pmid=grobid_ref.get("pmid"), + pmcid=clean_pmcid(grobid_ref.get("pmcid")), + arxiv_id=grobid_ref.get("arxiv_id"), + url=clean_url_conservative(grobid_ref.get("url")), + ), + release_ident=release.ident, + work_ident=release.work_id, + release_stage=release.release_stage, + release_year=release.release_year, + index=i + 1, # 1-indexed + key=clean_ref_key(ref.get("key"), doi=record.get("DOI")), + # locator, + target_release_id=None, + ref_source="crossref-grobid", + ) + ) + continue + + authors = [] if ref.get("author"): authors = [ ref["author"], @@ -838,7 +886,7 @@ def refs_from_crossref( unstructured=ref.get("unstructured"), title=ref_title, subtitle=ref.get("subtitle"), - contrib_raw_names=authors, + contrib_raw_names=authors or None, year=year, container_name=ref_container_name, publisher=ref.get("publisher"), @@ -857,7 +905,7 @@ def refs_from_crossref( key=clean_ref_key(ref.get("key"), doi=record.get("DOI")), # locator, target_release_id=None, - ref_source=ref_source, + ref_source="crossref", ) ) return output diff --git a/tests/files/example_crossref_record.json b/tests/files/example_crossref_record.json index d87c7c2..e96acfe 100644 --- a/tests/files/example_crossref_record.json +++ b/tests/files/example_crossref_record.json @@ -220,6 +220,7 @@ ], "type": "journal-article" }, + "grobid_refs": [], "release_ident": "arzkbn5brjf2nitdy4fkiusc4q" } -- cgit v1.2.3