From 4a40c62f6616825342bb23d03b9c4b9eebfe809c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 1 Jun 2021 01:05:23 -0700
Subject: add 'crossref' hydration to work pipeline

The immediate motivation is to include recent crossref refs in citation
graph transforms. May also be valuable for researchers to have
authoritative/publisher metadata in the bundle dumps.
---
Review notes (free-form text after the '---' marker; not part of the
commit message):
 * Line structure and leading whitespace were reconstructed from a
   whitespace-collapsed copy of this patch. Verify context-line
   indentation against the tree before applying, or apply with
   --ignore-whitespace.
 * The crossref lookup loop in work_pipeline.py now passes the release for
   the current ident to fetch_crossref(); it previously always passed
   release_dict[pref_idents[0]], so every iteration repeated the same
   lookup. The post-image blob id on that file's 'index' line predates
   this correction.

 fatcat_scholar/sandcrawler.py   | 11 +++++++++++
 fatcat_scholar/work_pipeline.py | 35 +++++++++++++++++++++++++++++++++++
 tests/test_work_pipeline.py     | 16 ++++++++++++++++
 3 files changed, 62 insertions(+)

diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 356b373..9b033b8 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -38,6 +38,17 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
+    def get_crossref(self, doi: str) -> Optional[Dict[str, Any]]:
+        resp = requests.get(
+            self.api_url + "/crossref", params=dict(doi="eq." + doi)
+        )
+        resp.raise_for_status()
+        resp_json = resp.json()
+        if resp_json:
+            return resp_json[0]
+        else:
+            return None
+
 
 class SandcrawlerMinioClient:
     def __init__(
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 7b477a0..b90b747 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -218,6 +218,33 @@ class WorkPipeline:
             webcapture_ident=wc.ident,
         )
 
+    def fetch_crossref(
+        self, re: ReleaseEntity
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Fetches (cached) crossref metadata JSON from sandcrawler-db via
+        postgrest HTTP interface.
+
+        Returns a JSON object on success, or None if not found.
+
+        release_ident: Optional[str]
+        doi: Optional[str]
+        record: Optional[str]
+        """
+        if not re.ext_ids.doi:
+            # can't do lookup without a DOI
+            return None
+        if re.extra and (not re.extra.get('crossref')) and (re.extra.get('datacite') or re.extra.get('jalc')):
+            # if this is definitely a Datacite or JALC DOI, can skip the Crossref cache lookup
+            return None
+        doi = re.ext_ids.doi.lower()
+        crossref_meta = self.sandcrawler_db_client.get_crossref(doi)
+        if not crossref_meta or not crossref_meta.get("record"):
+            return None
+        return dict(
+            release_ident=re.ident, doi=doi, record=crossref_meta["record"],
+        )
+
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
         """
         Checks in IssueDB to see if this release is likely to have a copy in a
@@ -385,6 +412,14 @@ class WorkPipeline:
             if sim_fulltext:
                 break
 
+        # lookup best available crossref biblio metadata
+        biblio_crossref = None
+        for ident in pref_idents:
+            release = release_dict[ident]
+            biblio_crossref = self.fetch_crossref(release)
+            if biblio_crossref:
+                break
+
         return IntermediateBundle(
             doc_type=DocType.work,
             releases=releases,
diff --git a/tests/test_work_pipeline.py b/tests/test_work_pipeline.py
index e0e4a82..bf423b7 100644
--- a/tests/test_work_pipeline.py
+++ b/tests/test_work_pipeline.py
@@ -69,6 +69,22 @@ def test_run_transform(mocker: Any) -> None:
         ],
     )
 
+    responses.add(
+        responses.GET,
+        "http://disabled-during-tests-bogus.xyz:3333/crossref?doi=eq.10.7717%2Fpeerj.4375",
+        status=200,
+        json=[
+            {
+                "doi": "10.7717/peerj.4375",
+                "indexed": "2020-07-07T02:15:52.98309+00:00",
+                "record": {
+                    "title": "something",
+                    "TODO_better_object": 3,
+                },
+            }
+        ],
+    )
+
     es_raw = mocker.patch("fatcat_scholar.work_pipeline.WorkPipeline.fetch_file_grobid")
     es_raw.side_effect = [
         {"tei_xml": "dummy", "release_ident": "asdf123", "file_ident": "xyq9876"},
-- 
cgit v1.2.3