From 16336ee0ae3f6785b31f7198a1bf497f3d89bf05 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 6 Dec 2021 16:17:45 -0800 Subject: work pipeline: add hack to enrich releases with some crossref metadata --- fatcat_scholar/work_pipeline.py | 55 ++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'fatcat_scholar') diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 93e7aa2..9dce006 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -89,6 +89,26 @@ def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]: return [r.ident for r in releases_sorted] +def enrich_release_from_crossref( + release: ReleaseEntity, record: Dict[str, Any] +) -> ReleaseEntity: + """ + Hack to copy some SIM-relevant fields from Crossref record to release entity. + + We should really update fatcat catalog itself with these fields, instead of + doing the update here in the scholar pipeline, but that is a more delicate + update, and we expect this to help make SIM matches faster (late 2021/early + 2022). + """ + if release.volume is None and record.get("volume"): + release.volume = clean_str(record["volume"]) + if release.issue is None and record.get("issue"): + release.issue = clean_str(record["issue"]) + if release.pages is None and record.get("pages"): + release.pages = clean_str(record["page"]) + return release + + class WorkPipeline: def __init__( self, @@ -270,10 +290,16 @@ class WorkPipeline: Checks in IssueDB to see if this release is likely to have a copy in a SIM issue item. - volume - issue + Releases must have all of these fields to be considered: + + - container_id + - volume + - issue + - pages """ - if not (release.container_id and release.volume and release.issue): + if not ( + release.container_id and release.volume and release.issue and release.pages + ): return None sim_pubid = self.issue_db.container2pubid(release.container_id) if not sim_pubid: @@ -375,6 +401,19 @@ class WorkPipeline: # print(f"pref_idents={pref_idents}", file=sys.stderr) + # lookup best available crossref biblio metadata + biblio_crossref = None + for ident in pref_idents: + release = release_dict[ident] + biblio_crossref = self.fetch_crossref(release) + if biblio_crossref: + assert biblio_crossref["release_ident"] == release.ident == ident + # HACK: copy some fields from crossref to release + release_dict[ident] = enrich_release_from_crossref( + release, biblio_crossref["record"] + ) + break + # find best accessible fatcat file grobid_fulltext: Optional[Any] = None pdf_meta: Optional[Any] = None @@ -422,7 +461,7 @@ class WorkPipeline: sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid) if not sim_pub: continue - # XXX: control flow tweak? + sim_fulltext = None try: sim_fulltext = self.fetch_sim( sim_issue, sim_pub, release.pages, release.ident @@ -437,14 +476,6 @@ class WorkPipeline: if sim_fulltext: break - # lookup best available crossref biblio metadata - biblio_crossref = None - for ident in pref_idents: - release = release_dict[ident] - biblio_crossref = self.fetch_crossref(release_dict[pref_idents[0]]) - if biblio_crossref: - break - return IntermediateBundle( doc_type=DocType.work, releases=releases, -- cgit v1.2.3