From 7497d1baf0c3a9c24f5b9ce05c9567e555e4e6c9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 16 Oct 2020 13:51:49 -0700
Subject: transform: refactor tag generation out of transform heavy method

---
 fatcat_scholar/transform.py | 65 ++++++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 04922ba..c08be7b 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -290,6 +290,42 @@ def biblio_metadata_hacks(biblio: ScholarBiblio) -> ScholarBiblio:  # noqa: C901
     return biblio
 
 
+def generate_tags(
+    biblio: ScholarBiblio, primary_release: Optional[ReleaseEntity]
+) -> List[str]:
+
+    tags = []
+
+    # tags
+    if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"):
+        tags.append("oa")
+    if primary_release and primary_release.container:
+        container = primary_release.container
+        if container.extra:
+            if container.extra.get("doaj"):
+                tags.append("doaj")
+                tags.append("oa")
+            if container.extra.get("road"):
+                tags.append("road")
+                tags.append("oa")
+            if container.extra.get("szczepanski"):
+                tags.append("szczepanski")
+                tags.append("oa")
+            if container.extra.get("ia", {}).get("longtail_oa"):
+                tags.append("longtail")
+                tags.append("oa")
+            if container.extra.get("sherpa_romeo", {}).get("color") == "white":
+                tags.append("oa")
+            if container.extra.get("default_license", "").lower().startswith("cc-"):
+                tags.append("oa")
+            if container.extra.get("platform"):
+                # scielo, ojs, wordpress, etc
+                tags.append(container.extra["platform"].lower())
+    if biblio.doi_prefix == "10.2307":
+        tags.append("jstor")
+    return list(set(tags))
+
+
 def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
 
     tags: List[str] = []
@@ -386,34 +422,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
 
     # TODO: additional abstracts
 
-    # tags
-    if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"):
-        tags.append("oa")
-    if primary_release and primary_release.container:
-        container = primary_release.container
-        if container.extra:
-            if container.extra.get("doaj"):
-                tags.append("doaj")
-                tags.append("oa")
-            if container.extra.get("road"):
-                tags.append("road")
-                tags.append("oa")
-            if container.extra.get("szczepanski"):
-                tags.append("szczepanski")
-                tags.append("oa")
-            if container.extra.get("ia", {}).get("longtail_oa"):
-                tags.append("longtail")
-                tags.append("oa")
-            if container.extra.get("sherpa_romeo", {}).get("color") == "white":
-                tags.append("oa")
-            if container.extra.get("default_license", "").lower().startswith("cc-"):
-                tags.append("oa")
-            if container.extra.get("platform"):
-                # scielo, ojs, wordpress, etc
-                tags.append(container.extra["platform"].lower())
-    if biblio.doi_prefix == "10.2307":
-        tags.append("jstor")
-    tags = list(set(tags))
+    tags = generate_tags(biblio, primary_release)
 
     # biorxiv/medrxiv hacks
     if (
-- 
cgit v1.2.3