about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:25:21 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:50:00 -0800
commit f60d37fb2f9079e6707f9a253983b6ea07964e18 (patch)
tree e36db11b2123a7a50d48dbf0f8449bcef9e13fcc
parent c1af67fc72c671c4dc40536960ab47e78195c881 (diff)
download fatcat-scholar-f60d37fb2f9079e6707f9a253983b6ea07964e18.tar.gz
fatcat-scholar-f60d37fb2f9079e6707f9a253983b6ea07964e18.zip
fetch GROBID-parsed refs along with crossref metadata
-rw-r--r-- fatcat_scholar/sandcrawler.py 6
-rw-r--r-- fatcat_scholar/work_pipeline.py 3
-rw-r--r-- tests/test_work_pipeline.py 3
3 files changed, 8 insertions, 4 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 087cdc6..5580841 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -39,8 +39,10 @@ class SandcrawlerPostgrestClient:
else:
return None
- def get_crossref(self, doi: str) -> Optional[Dict[str, Any]]:
- resp = requests.get(self.api_url + "/crossref", params=dict(doi="eq." + doi))
+ def get_crossref_with_refs(self, doi: str) -> Optional[Dict[str, Any]]:
+ resp = requests.get(
+ self.api_url + "/crossref_with_refs", params=dict(doi="eq." + doi)
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 9dce006..fa1a7bc 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -276,13 +276,14 @@ class WorkPipeline:
# if this is definitely a Datacite or JALC DOI, can skip the Crossref cache lookup
return None
doi = re.ext_ids.doi.lower()
- crossref_meta = self.sandcrawler_db_client.get_crossref(doi)
+ crossref_meta = self.sandcrawler_db_client.get_crossref_with_refs(doi)
if not crossref_meta or not crossref_meta.get("record"):
return None
return dict(
release_ident=re.ident,
doi=doi,
record=crossref_meta["record"],
+ grobid_refs=crossref_meta["refs_json"] or [],
)
def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
diff --git a/tests/test_work_pipeline.py b/tests/test_work_pipeline.py
index a28f728..bc8a79a 100644
--- a/tests/test_work_pipeline.py
+++ b/tests/test_work_pipeline.py
@@ -73,7 +73,7 @@ def test_run_transform(mocker: Any) -> None:
responses.add(
responses.GET,
- "http://disabled-during-tests-bogus.xyz:3333/crossref?doi=eq.10.7717%2Fpeerj.4375",
+ "http://disabled-during-tests-bogus.xyz:3333/crossref_with_refs?doi=eq.10.7717%2Fpeerj.4375",
status=200,
json=[
{
@@ -83,6 +83,7 @@ def test_run_transform(mocker: Any) -> None:
"title": "something",
"TODO_better_object": 3,
},
+ "refs_json": [],
}
],
)