diff options
| -rw-r--r-- | fatcat_scholar/sandcrawler.py | 11 | ||||
| -rw-r--r-- | fatcat_scholar/work_pipeline.py | 35 | ||||
| -rw-r--r-- | tests/test_work_pipeline.py | 16 | 
3 files changed, 62 insertions, 0 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 356b373..9b033b8 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -38,6 +38,17 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
+    def get_crossref(self, doi: str) -> Optional[Dict[str, Any]]:
+        resp = requests.get(
+            self.api_url + "/crossref", params=dict(doi="eq." + doi)
+        )
+        resp.raise_for_status()
+        resp_json = resp.json()
+        if resp_json:
+            return resp_json[0]
+        else:
+            return None
+
 
 class SandcrawlerMinioClient:
     def __init__(
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 7b477a0..b90b747 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -218,6 +218,33 @@ class WorkPipeline:
             webcapture_ident=wc.ident,
         )
 
+    def fetch_crossref(
+        self, re: ReleaseEntity
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Fetches (cached) crossref metadata JSON from sandcrawler-db via
+        postgrest HTTP interface.
+
+        Returns a dict with the keys below on success, or None if not found.
+
+        release_ident: Optional[str]
+        doi: Optional[str]
+        record: Optional[str]
+        """
+        if not re.ext_ids.doi:
+            # can't do lookup without a DOI
+            return None
+        if re.extra and (not re.extra.get('crossref')) and (re.extra.get('datacite') or re.extra.get('jalc')):
+            # if this is definitely a Datacite or JALC DOI, can skip the Crossref cache lookup
+            return None
+        doi = re.ext_ids.doi.lower()
+        crossref_meta = self.sandcrawler_db_client.get_crossref(doi)
+        if not crossref_meta or not crossref_meta.get("record"):
+            return None
+        return dict(
+            release_ident=re.ident, doi=doi, record=crossref_meta["record"],
+        )
+
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
         """
         Checks in IssueDB to see if this release is likely to have a copy in a
@@ -385,6 +412,14 @@ class WorkPipeline:
             if sim_fulltext:
                 break
 
+        # lookup best available crossref biblio metadata
+        biblio_crossref = None
+        for ident in pref_idents:
+            release = release_dict[ident]
+            biblio_crossref = self.fetch_crossref(release)
+            if biblio_crossref:
+                break
+
         return IntermediateBundle(
             doc_type=DocType.work,
             releases=releases,
diff --git a/tests/test_work_pipeline.py b/tests/test_work_pipeline.py
index e0e4a82..bf423b7 100644
--- a/tests/test_work_pipeline.py
+++ b/tests/test_work_pipeline.py
@@ -69,6 +69,22 @@ def test_run_transform(mocker: Any) -> None:
         ],
     )
 
+    responses.add(
+        responses.GET,
+        "http://disabled-during-tests-bogus.xyz:3333/crossref?doi=eq.10.7717%2Fpeerj.4375",
+        status=200,
+        json=[
+            {
+                "doi": "10.7717/peerj.4375",
+                "indexed": "2020-07-07T02:15:52.98309+00:00",
+                "record": {
+                    "title": "something",
+                    "TODO_better_object": 3,
+                },
+            }
+        ],
+    )
+
     es_raw = mocker.patch("fatcat_scholar.work_pipeline.WorkPipeline.fetch_file_grobid")
     es_raw.side_effect = [
         {"tei_xml": "<xml>dummy", "release_ident": "asdf123", "file_ident": "xyq9876"},
