aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/work_pipeline.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:17:45 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:17:45 -0800
commit 16336ee0ae3f6785b31f7198a1bf497f3d89bf05 (patch)
tree f3942d09228f38d7478dffd6250b0aeecad6d0ba /fatcat_scholar/work_pipeline.py
parent aa72b77ba0268adac1583b96eb4ac40dbdfc9e4c (diff)
download fatcat-scholar-16336ee0ae3f6785b31f7198a1bf497f3d89bf05.tar.gz
fatcat-scholar-16336ee0ae3f6785b31f7198a1bf497f3d89bf05.zip
work pipeline: add hack to enrich releases with some crossref metadata
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- fatcat_scholar/work_pipeline.py 55
1 files changed, 43 insertions, 12 deletions
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 93e7aa2..9dce006 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -89,6 +89,26 @@ def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
return [r.ident for r in releases_sorted]
+def enrich_release_from_crossref(
+    release: ReleaseEntity, record: Dict[str, Any]
+) -> ReleaseEntity:
+    """
+    Hack to copy some SIM-relevant fields from Crossref record to release entity.
+
+    Only fields that are None on the release are filled; the release is mutated
+    in place and returned. Crossref names the page range "page", not "pages".
+    Ideally the fatcat catalog itself would be updated instead of patching here
+    in the scholar pipeline, but this is quicker for SIM matching (2021/2022).
+    """
+    if release.volume is None and record.get("volume"):
+        release.volume = clean_str(record["volume"])
+    if release.issue is None and record.get("issue"):
+        release.issue = clean_str(record["issue"])
+    if release.pages is None and record.get("page"):
+        release.pages = clean_str(record["page"])
+    return release
+
+
class WorkPipeline:
def __init__(
self,
@@ -270,10 +290,16 @@ class WorkPipeline:
Checks in IssueDB to see if this release is likely to have a copy in a
SIM issue item.
- volume
- issue
+ Releases must have all of these fields to be considered:
+
+ - container_id
+ - volume
+ - issue
+ - pages
"""
- if not (release.container_id and release.volume and release.issue):
+ if not (
+ release.container_id and release.volume and release.issue and release.pages
+ ):
return None
sim_pubid = self.issue_db.container2pubid(release.container_id)
if not sim_pubid:
@@ -375,6 +401,19 @@ class WorkPipeline:
# print(f"pref_idents={pref_idents}", file=sys.stderr)
+ # lookup best available crossref biblio metadata
+ biblio_crossref = None
+ for ident in pref_idents:
+ release = release_dict[ident]
+ biblio_crossref = self.fetch_crossref(release)
+ if biblio_crossref:
+ assert biblio_crossref["release_ident"] == release.ident == ident
+ # HACK: copy some fields from crossref to release
+ release_dict[ident] = enrich_release_from_crossref(
+ release, biblio_crossref["record"]
+ )
+ break
+
# find best accessible fatcat file
grobid_fulltext: Optional[Any] = None
pdf_meta: Optional[Any] = None
@@ -422,7 +461,7 @@ class WorkPipeline:
sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid)
if not sim_pub:
continue
- # XXX: control flow tweak?
+ sim_fulltext = None
try:
sim_fulltext = self.fetch_sim(
sim_issue, sim_pub, release.pages, release.ident
@@ -437,14 +476,6 @@ class WorkPipeline:
if sim_fulltext:
break
- # lookup best available crossref biblio metadata
- biblio_crossref = None
- for ident in pref_idents:
- release = release_dict[ident]
- biblio_crossref = self.fetch_crossref(release_dict[pref_idents[0]])
- if biblio_crossref:
- break
-
return IntermediateBundle(
doc_type=DocType.work,
releases=releases,