add 'crossref' hydration to work pipeline

The immediate motivation is to include recent crossref refs in citation graph transforms. May also be valuable for researchers to have authoritative/publisher metadata in the bundle dumps.
author: Bryan Newbold <bnewbold@archive.org> 2021-06-01 01:05:23 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-06-02 00:38:51 -0700
commit: 4a40c62f6616825342bb23d03b9c4b9eebfe809c (patch)
tree: f999a05e76a4e2c3965dca2a12a07f654810f869
parent: 01a1978d5b9667df4ae71a7934512e6c4e3bf9a8 (diff)
download: fatcat-scholar-4a40c62f6616825342bb23d03b9c4b9eebfe809c.tar.gz
fatcat-scholar-4a40c62f6616825342bb23d03b9c4b9eebfe809c.zip
3 files changed, 62 insertions, 0 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 356b373..9b033b8 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -38,6 +38,17 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
+    def get_crossref(self, doi: str) -> Optional[Dict[str, Any]]:
+        """
+        Returns the first "/crossref" row matching `doi`, or None if no rows.
+        HTTP error statuses raise via raise_for_status().
+        """
+        url = self.api_url + "/crossref"
+        resp = requests.get(url, params={"doi": "eq." + doi})
+        resp.raise_for_status()
+        rows = resp.json()
+        return rows[0] if rows else None
+
 
 class SandcrawlerMinioClient:
     def __init__(
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 7b477a0..b90b747 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -218,6 +218,33 @@ class WorkPipeline:
             webcapture_ident=wc.ident,
         )
 
+    def fetch_crossref(
+        self, re: ReleaseEntity
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Looks up crossref metadata for a release's DOI in sandcrawler-db.
+
+        Returns None if the release has no DOI, if its `extra` marks it as a
+        Datacite or JALC DOI (and not crossref), or if the lookup yields no
+        record. Otherwise returns a dict with keys:
+
+        release_ident: ident of the release
+        doi: lower-cased DOI used for the lookup
+        record: the "record" field of the crossref row (metadata JSON)
+        """
+        if not re.ext_ids.doi:
+            return None
+        extra = re.extra
+        if extra and not extra.get("crossref"):
+            if extra.get("datacite") or extra.get("jalc"):
+                # not a Crossref DOI, so the cache lookup can be skipped
+                return None
+        doi = re.ext_ids.doi.lower()
+        row = self.sandcrawler_db_client.get_crossref(doi)
+        if row and row.get("record"):
+            return dict(release_ident=re.ident, doi=doi, record=row["record"])
+        return None
+
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
         """
         Checks in IssueDB to see if this release is likely to have a copy in a
@@ -385,6 +412,14 @@ class WorkPipeline:
             if sim_fulltext:
                 break
 
+        # lookup best available crossref biblio metadata (first hit, in pref_idents order)
+        biblio_crossref = None
+        for ident in pref_idents:
+            release = release_dict[ident]
+            biblio_crossref = self.fetch_crossref(release)
+            if biblio_crossref:
+                break
+
         return IntermediateBundle(
             doc_type=DocType.work,
             releases=releases,
diff --git a/tests/test_work_pipeline.py b/tests/test_work_pipeline.py
index e0e4a82..bf423b7 100644
--- a/tests/test_work_pipeline.py
+++ b/tests/test_work_pipeline.py
@@ -69,6 +69,22 @@ def test_run_transform(mocker: Any) -> None:
         ],
     )
 
+    responses.add(  # mock GET /crossref?doi=eq.<DOI> response for 10.7717/peerj.4375
+        responses.GET,
+        "http://disabled-during-tests-bogus.xyz:3333/crossref?doi=eq.10.7717%2Fpeerj.4375",
+        status=200,
+        json=[
+            {
+                "doi": "10.7717/peerj.4375",
+                "indexed": "2020-07-07T02:15:52.98309+00:00",
+                "record": {
+                    "title": "something",
+                    "TODO_better_object": 3,
+                },
+            }
+        ],
+    )
+
     es_raw = mocker.patch("fatcat_scholar.work_pipeline.WorkPipeline.fetch_file_grobid")
     es_raw.side_effect = [
         {"tei_xml": "<xml>dummy", "release_ident": "asdf123", "file_ident": "xyq9876"},
author	Bryan Newbold <bnewbold@archive.org>	2021-06-01 01:05:23 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-06-02 00:38:51 -0700
commit	4a40c62f6616825342bb23d03b9c4b9eebfe809c (patch)
tree	f999a05e76a4e2c3965dca2a12a07f654810f869
parent	01a1978d5b9667df4ae71a7934512e6c4e3bf9a8 (diff)
download	fatcat-scholar-4a40c62f6616825342bb23d03b9c4b9eebfe809c.tar.gz fatcat-scholar-4a40c62f6616825342bb23d03b9c4b9eebfe809c.zip