diff options
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 4 | ||||
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 14 | ||||
-rw-r--r-- | fatcat_scholar/worker.py | 10 | ||||
-rw-r--r-- | tests/test_work_pipeline.py | 5 |
4 files changed, 11 insertions, 22 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py index 9b033b8..0501f8e 100644 --- a/fatcat_scholar/sandcrawler.py +++ b/fatcat_scholar/sandcrawler.py @@ -39,9 +39,7 @@ class SandcrawlerPostgrestClient: return None def get_crossref(self, doi: str) -> Optional[Dict[str, Any]]: - resp = requests.get( - self.api_url + "/crossref", params=dict(doi="eq." + doi) - ) + resp = requests.get(self.api_url + "/crossref", params=dict(doi="eq." + doi)) resp.raise_for_status() resp_json = resp.json() if resp_json: diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index b90b747..2b09821 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -218,9 +218,7 @@ class WorkPipeline: webcapture_ident=wc.ident, ) - def fetch_crossref( - self, re: ReleaseEntity - ) -> Optional[Dict[str, Any]]: + def fetch_crossref(self, re: ReleaseEntity) -> Optional[Dict[str, Any]]: """ Fetches (cached) crossref metadata JSON from sandcrawler-db via postgrest HTTP interface. @@ -234,16 +232,18 @@ class WorkPipeline: if not re.ext_ids.doi: # can't do lookup without a DOI return None - if re.extra and (not re.extra.get('crossref')) and (re.extra.get('datacite') or re.extra.get('jalc')): + if ( + re.extra + and (not re.extra.get("crossref")) + and (re.extra.get("datacite") or re.extra.get("jalc")) + ): # if this is definitely a Datacite or JALC DOI, can skip the Crossref cache lookup return None doi = re.ext_ids.doi.lower() crossref_meta = self.sandcrawler_db_client.get_crossref(doi) if not crossref_meta or not crossref_meta.get("record"): return None - return dict( - release_ident=re.ident, doi=doi, record=crossref_meta["record"], - ) + return dict(release_ident=re.ident, doi=doi, record=crossref_meta["record"],) def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]: """ diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py index 7787d42..7d2b3d6 100644 --- a/fatcat_scholar/worker.py +++ b/fatcat_scholar/worker.py @@ -1,6 +1,5 @@ import os import sys -import json import argparse import datetime from typing import List, Any @@ -10,7 +9,6 @@ import sentry_sdk import elasticsearch import elasticsearch.helpers import fatcat_openapi_client -from fatcat_openapi_client import ReleaseEntity from fatcat_scholar.config import settings, GIT_REVISION from fatcat_scholar.issue_db import IssueDB @@ -18,12 +16,8 @@ from fatcat_scholar.sandcrawler import ( SandcrawlerPostgrestClient, SandcrawlerMinioClient, ) -from fatcat_scholar.schema import ( - DocType, - IntermediateBundle, -) +from fatcat_scholar.schema import IntermediateBundle from fatcat_scholar.transform import transform_heavy -from fatcat_scholar.api_entities import entity_from_json from fatcat_scholar.work_pipeline import WorkPipeline from fatcat_scholar.sim_pipeline import SimPipeline from fatcat_scholar.kafka import KafkaWorker @@ -124,7 +118,7 @@ class IndexDocsWorker(KafkaWorker): bulk_actions = [] for obj in batch: bundle = IntermediateBundle.from_json(obj) - assert bundle.get('doc_type') + assert bundle.doc_type es_doc = transform_heavy(bundle) if not es_doc: continue diff --git a/tests/test_work_pipeline.py b/tests/test_work_pipeline.py index bf423b7..977a708 100644 --- a/tests/test_work_pipeline.py +++ b/tests/test_work_pipeline.py @@ -77,10 +77,7 @@ def test_run_transform(mocker: Any) -> None: { "doi": "10.7717/peerj.4375", "indexed": "2020-07-07T02:15:52.98309+00:00", - "record": { - "title": "something", - "TODO_better_object": 3, - }, + "record": {"title": "something", "TODO_better_object": 3,}, } ], ) |