work pipeline: add hack to enrich releases with some crossref metadata

author: Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:17:45 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-12-06 16:17:45 -0800
commit: 16336ee0ae3f6785b31f7198a1bf497f3d89bf05 (patch)
tree: f3942d09228f38d7478dffd6250b0aeecad6d0ba /fatcat_scholar/work_pipeline.py
parent: aa72b77ba0268adac1583b96eb4ac40dbdfc9e4c (diff)
download: fatcat-scholar-16336ee0ae3f6785b31f7198a1bf497f3d89bf05.tar.gz
fatcat-scholar-16336ee0ae3f6785b31f7198a1bf497f3d89bf05.zip
1 files changed, 43 insertions, 12 deletions
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 93e7aa2..9dce006 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -89,6 +89,26 @@ def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
     return [r.ident for r in releases_sorted]
 
 
+def enrich_release_from_crossref(
+    release: ReleaseEntity, record: Dict[str, Any]
+) -> ReleaseEntity:
+    """
+    Hack: fill empty volume/issue/pages on a release from a Crossref record.
+
+    Only fields that are None on the release are set; the release is mutated
+    in place and returned. We should really update the fatcat catalog itself,
+    but that is a more delicate update, and this should make SIM matches
+    faster (late 2021/early 2022). Crossref's page-range key is "page".
+    """
+    if release.volume is None and record.get("volume"):
+        release.volume = clean_str(record["volume"])
+    if release.issue is None and record.get("issue"):
+        release.issue = clean_str(record["issue"])
+    if release.pages is None and record.get("page"):
+        release.pages = clean_str(record["page"])
+    return release
+
+
 class WorkPipeline:
     def __init__(
         self,
@@ -270,10 +290,16 @@ class WorkPipeline:
         Checks in IssueDB to see if this release is likely to have a copy in a
         SIM issue item.
 
-        volume
-        issue
+        Releases must have all of these fields to be considered:
+
+        - container_id
+        - volume
+        - issue
+        - pages
         """
-        if not (release.container_id and release.volume and release.issue):
+        if not (
+            release.container_id and release.volume and release.issue and release.pages
+        ):
             return None
         sim_pubid = self.issue_db.container2pubid(release.container_id)
         if not sim_pubid:
@@ -375,6 +401,19 @@ class WorkPipeline:
 
         # print(f"pref_idents={pref_idents}", file=sys.stderr)
 
+        # lookup best available crossref biblio metadata
+        biblio_crossref = None
+        for ident in pref_idents:
+            release = release_dict[ident]
+            biblio_crossref = self.fetch_crossref(release)
+            if biblio_crossref:
+                assert biblio_crossref["release_ident"] == release.ident == ident
+                # HACK: copy some fields from crossref to release
+                release_dict[ident] = enrich_release_from_crossref(
+                    release, biblio_crossref["record"]
+                )
+                break
+
         # find best accessible fatcat file
         grobid_fulltext: Optional[Any] = None
         pdf_meta: Optional[Any] = None
@@ -422,7 +461,7 @@ class WorkPipeline:
             sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid)
             if not sim_pub:
                 continue
-            # XXX: control flow tweak?
+            sim_fulltext = None
             try:
                 sim_fulltext = self.fetch_sim(
                     sim_issue, sim_pub, release.pages, release.ident
@@ -437,14 +476,6 @@ class WorkPipeline:
             if sim_fulltext:
                 break
 
-        # lookup best available crossref biblio metadata
-        biblio_crossref = None
-        for ident in pref_idents:
-            release = release_dict[ident]
-            biblio_crossref = self.fetch_crossref(release_dict[pref_idents[0]])
-            if biblio_crossref:
-                break
-
         return IntermediateBundle(
             doc_type=DocType.work,
             releases=releases,
author	Bryan Newbold <bnewbold@archive.org>	2021-12-06 16:17:45 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2021-12-06 16:17:45 -0800
commit	16336ee0ae3f6785b31f7198a1bf497f3d89bf05 (patch)
tree	f3942d09228f38d7478dffd6250b0aeecad6d0ba /fatcat_scholar/work_pipeline.py
parent	aa72b77ba0268adac1583b96eb4ac40dbdfc9e4c (diff)
download	fatcat-scholar-16336ee0ae3f6785b31f7198a1bf497f3d89bf05.tar.gz fatcat-scholar-16336ee0ae3f6785b31f7198a1bf497f3d89bf05.zip