aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/work_pipeline.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- fatcat_scholar/work_pipeline.py 35
1 files changed, 35 insertions, 0 deletions
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 7b477a0..b90b747 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -218,6 +218,33 @@ class WorkPipeline:
webcapture_ident=wc.ident,
)
+ def fetch_crossref(
+     self, re: ReleaseEntity
+ ) -> Optional[Dict[str, Any]]:
+     """
+     Look up Crossref metadata for a release via self.sandcrawler_db_client
+     (cached crossref JSON in sandcrawler-db, behind a postgrest HTTP API).
+
+     Returns None if the release has no DOI, if `extra` flags a Datacite or
+     JALC DOI without a 'crossref' entry, or if no "record" comes back.
+     Otherwise returns a dict with keys: release_ident, doi (lower-cased),
+     and record (the "record" value from the lookup result).
+     """
+     raw_doi = re.ext_ids.doi
+     if not raw_doi:
+         return None
+     extra = re.extra
+     if extra and not extra.get('crossref'):
+         # definitely a Datacite or JALC DOI: skip the Crossref cache lookup
+         if extra.get('datacite') or extra.get('jalc'):
+             return None
+     doi = raw_doi.lower()
+     row = self.sandcrawler_db_client.get_crossref(doi)
+     record = row.get("record") if row else None
+     if not record:
+         return None
+     return {"release_ident": re.ident, "doi": doi, "record": record}
+
def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
"""
Checks in IssueDB to see if this release is likely to have a copy in a
@@ -385,6 +412,14 @@ class WorkPipeline:
if sim_fulltext:
break
+ # lookup best available crossref biblio metadata (first hit in pref_idents order)
+ biblio_crossref = None
+ for ident in pref_idents:
+     release = release_dict[ident]
+     biblio_crossref = self.fetch_crossref(release)
+     if biblio_crossref:
+         break
+
return IntermediateBundle(
doc_type=DocType.work,
releases=releases,