aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:52:18 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-06 17:00:50 -0800
commit 7fa9e7dd83e41f3d331cb6b10df5f950f3d5ec8f (patch)
tree 23c13a0aca8d558eed412958e0e21e91eecaa2b7
parent f60d37fb2f9079e6707f9a253983b6ea07964e18 (diff)
download fatcat-scholar-7fa9e7dd83e41f3d331cb6b10df5f950f3d5ec8f.tar.gz
fatcat-scholar-7fa9e7dd83e41f3d331cb6b10df5f950f3d5ec8f.zip
refs: include GROBID-parsed crossref refs
This takes advantage of Crossref 'unstructured' refs which have been parsed using GROBID and stored in the sandcrawler database, as part of the sandcrawler crossref metadata pipeline.
-rw-r--r-- fatcat_scholar/transform.py 56
-rw-r--r-- tests/files/example_crossref_record.json 1
2 files changed, 53 insertions, 4 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index d40e123..207c325 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -697,6 +697,7 @@ def refs_from_grobid(
if a.full_name:
assert isinstance(a.full_name, str)
authors.append(a.full_name)
+ # TODO: else what?
ref_index = ref.index
if ref_index is not None:
# transform from 0-indexed to 1-indexed
@@ -794,10 +795,57 @@ def refs_from_crossref(
record = crossref["record"]
if not record.get("reference"):
return []
+ grobid_refs = dict()
+ for ref in crossref.get("grobid_refs") or []:
+ # TODO: some kind of check whether we should include this, based on
+ # source date or similar?
+ grobid_refs[ref["id"]] = ref
output = []
for i, ref in enumerate(record.get("reference", [])):
- ref_source = "crossref"
- authors: Optional[List[str]] = None
+ if ref.get("unstructured") and ref["key"] in grobid_refs:
+ # use the GROBID-parsed ref instead of the crossref ref itself
+ # the schema is that of GrobidBiblio
+ grobid_ref = grobid_refs[ref["key"]]
+ ref_date = grobid_ref.get("date")
+ ref_year: Optional[int] = None
+ if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit():
+ ref_year = int(ref_date[:4])
+ authors: List[str] = []
+ for a in grobid_ref.get("authors") or []:
+ if a.get("full_name"):
+ authors.append(a["full_name"])
+ output.append(
+ RefStructured(
+ biblio=RefBiblio(
+ title=grobid_ref.get("title"),
+ # subtitle
+ contrib_raw_names=authors or None,
+ year=ref_year,
+ container_name=grobid_ref.get("journal"),
+ publisher=grobid_ref.get("publisher"),
+ volume=grobid_ref.get("volume"),
+ issue=grobid_ref.get("issue"),
+ pages=grobid_ref.get("pages"),
+ doi=clean_doi(grobid_ref.get("doi")),
+ pmid=grobid_ref.get("pmid"),
+ pmcid=clean_pmcid(grobid_ref.get("pmcid")),
+ arxiv_id=grobid_ref.get("arxiv_id"),
+ url=clean_url_conservative(grobid_ref.get("url")),
+ ),
+ release_ident=release.ident,
+ work_ident=release.work_id,
+ release_stage=release.release_stage,
+ release_year=release.release_year,
+ index=i + 1, # 1-indexed
+ key=clean_ref_key(ref.get("key"), doi=record.get("DOI")),
+ # locator,
+ target_release_id=None,
+ ref_source="crossref-grobid",
+ )
+ )
+ continue
+
+ authors = []
if ref.get("author"):
authors = [
ref["author"],
@@ -838,7 +886,7 @@ def refs_from_crossref(
unstructured=ref.get("unstructured"),
title=ref_title,
subtitle=ref.get("subtitle"),
- contrib_raw_names=authors,
+ contrib_raw_names=authors or None,
year=year,
container_name=ref_container_name,
publisher=ref.get("publisher"),
@@ -857,7 +905,7 @@ def refs_from_crossref(
key=clean_ref_key(ref.get("key"), doi=record.get("DOI")),
# locator,
target_release_id=None,
- ref_source=ref_source,
+ ref_source="crossref",
)
)
return output
diff --git a/tests/files/example_crossref_record.json b/tests/files/example_crossref_record.json
index d87c7c2..e96acfe 100644
--- a/tests/files/example_crossref_record.json
+++ b/tests/files/example_crossref_record.json
@@ -220,6 +220,7 @@
],
"type": "journal-article"
},
+ "grobid_refs": [],
"release_ident": "arzkbn5brjf2nitdy4fkiusc4q"
}