summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:52:18 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-06 17:00:50 -0800
commit 7fa9e7dd83e41f3d331cb6b10df5f950f3d5ec8f (patch)
tree 23c13a0aca8d558eed412958e0e21e91eecaa2b7 /fatcat_scholar
parent f60d37fb2f9079e6707f9a253983b6ea07964e18 (diff)
download fatcat-scholar-7fa9e7dd83e41f3d331cb6b10df5f950f3d5ec8f.tar.gz
fatcat-scholar-7fa9e7dd83e41f3d331cb6b10df5f950f3d5ec8f.zip
refs: include GROBID-parsed crossref refs
This takes advantage of Crossref 'unstructured' refs which have been parsed using GROBID and stored in the sandcrawler database, as part of the sandcrawler crossref metadata pipeline.
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/transform.py 56
1 files changed, 52 insertions, 4 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index d40e123..207c325 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -697,6 +697,7 @@ def refs_from_grobid(
if a.full_name:
assert isinstance(a.full_name, str)
authors.append(a.full_name)
+ # TODO: else what?
ref_index = ref.index
if ref_index is not None:
# transform from 0-indexed to 1-indexed
@@ -794,10 +795,57 @@ def refs_from_crossref(
record = crossref["record"]
if not record.get("reference"):
return []
+ grobid_refs = dict()
+ for ref in crossref.get("grobid_refs") or []:
+ # TODO: some kind of check whether we should include this, based on
+ # source date or similar?
+ grobid_refs[ref["id"]] = ref
output = []
for i, ref in enumerate(record.get("reference", [])):
- ref_source = "crossref"
- authors: Optional[List[str]] = None
+ if ref.get("unstructured") and ref["key"] in grobid_refs:
+ # use the GROBID-parsed ref instead of the crossref ref itself
+ # the schema is that of GrobidBiblio
+ grobid_ref = grobid_refs[ref["key"]]
+ ref_date = grobid_ref.get("date")
+ ref_year: Optional[int] = None
+ if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit():
+ ref_year = int(ref_date[:4])
+ authors: List[str] = []
+ for a in grobid_ref.get("authors") or []:
+ if a.get("full_name"):
+ authors.append(a["full_name"])
+ output.append(
+ RefStructured(
+ biblio=RefBiblio(
+ title=grobid_ref.get("title"),
+ # subtitle
+ contrib_raw_names=authors or None,
+ year=ref_year,
+ container_name=grobid_ref.get("journal"),
+ publisher=grobid_ref.get("publisher"),
+ volume=grobid_ref.get("volume"),
+ issue=grobid_ref.get("issue"),
+ pages=grobid_ref.get("pages"),
+ doi=clean_doi(grobid_ref.get("doi")),
+ pmid=grobid_ref.get("pmid"),
+ pmcid=clean_pmcid(grobid_ref.get("pmcid")),
+ arxiv_id=grobid_ref.get("arxiv_id"),
+ url=clean_url_conservative(grobid_ref.get("url")),
+ ),
+ release_ident=release.ident,
+ work_ident=release.work_id,
+ release_stage=release.release_stage,
+ release_year=release.release_year,
+ index=i + 1, # 1-indexed
+ key=clean_ref_key(ref.get("key"), doi=record.get("DOI")),
+ # locator,
+ target_release_id=None,
+ ref_source="crossref-grobid",
+ )
+ )
+ continue
+
+ authors = []
if ref.get("author"):
authors = [
ref["author"],
@@ -838,7 +886,7 @@ def refs_from_crossref(
unstructured=ref.get("unstructured"),
title=ref_title,
subtitle=ref.get("subtitle"),
- contrib_raw_names=authors,
+ contrib_raw_names=authors or None,
year=year,
container_name=ref_container_name,
publisher=ref.get("publisher"),
@@ -857,7 +905,7 @@ def refs_from_crossref(
key=clean_ref_key(ref.get("key"), doi=record.get("DOI")),
# locator,
target_release_id=None,
- ref_source=ref_source,
+ ref_source="crossref",
)
)
return output