diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-21 19:25:09 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-21 19:36:21 -0800 |
commit | 05cd1bb7bdcefe8ec596e572baafb9da9a8838b2 (patch) | |
tree | 5c49a54bf594e5d0008264984e12e1d1a881c1fe /fatcat_scholar | |
parent | 5096b0b7407aadfb97093615696f951829dc3506 (diff) | |
download | fatcat-scholar-05cd1bb7bdcefe8ec596e572baafb9da9a8838b2.tar.gz fatcat-scholar-05cd1bb7bdcefe8ec596e572baafb9da9a8838b2.zip |
refactor DOI domain lookup into python code; expand table
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/biblio_hacks.py | 82 | ||||
-rw-r--r-- | fatcat_scholar/schema.py | 14 | ||||
-rw-r--r-- | fatcat_scholar/templates/search_macros.html | 34 |
3 files changed, 101 insertions, 29 deletions
diff --git a/fatcat_scholar/biblio_hacks.py b/fatcat_scholar/biblio_hacks.py
new file mode 100644
index 0000000..935d1ff
--- /dev/null
+++ b/fatcat_scholar/biblio_hacks.py
@@ -0,0 +1,82 @@
+from typing import Optional
+
+DOI_PREFIX_MAP = {
+    # simple entries (mostly OA and platforms)
+    "10.2307": {"domain": "jstor.org"},
+    "10.11501": {"domain": "ndl.go.jp"},
+    "10.6084": {"domain": "figshare.com"},
+    "10.5281": {"domain": "zenodo.org"},
+    "10.1590": {"domain": "scielo.br"},
+    "10.1371": {"domain": "plos.org"},
+    "10.1155": {"domain": "hindawi.com"},
+    "10.7554": {"domain": "elifesciences.org"},
+    "10.1145": {"domain": "acm.org"},
+    # more complex publisher mappings (verify journal/publisher)
+    "10.1016": {"domain": "elsevier.com", "publisher": "elsevier"},
+    "10.1007": {"domain": "springer.com", "publisher": "springer"},
+    "10.1186": {"domain": "springer.com", "publisher": "springer"},
+    "10.1002": {"domain": "wiley.com", "publisher": "wiley"},
+    "10.1109": {"domain": "ieee.org", "publisher": "ieee"},
+    "10.1080": {"domain": "tandfonline.com", "publisher": "informa"},
+    "10.1093": {"domain": "oup.com", "publisher": "oxford"},
+    "10.1111": {"domain": "sagepub.com", "publisher": "sage"},
+    "10.1042": {"domain": "sagepub.com", "publisher": "sage"},
+    "10.1177": {"domain": "sagepub.com", "publisher": "sage"},
+    "10.1021": {"domain": "acs.org", "publisher": "acs"},
+    "10.1017": {"domain": "cambridge.org", "publisher": "cambridge"},
+    # "10.1097": {"domain": "lww.org", "publisher": "wolters"},
+    "10.1515": {"domain": "degruyter.com", "publisher": "gruyter"},
+    "10.1038": {"domain": "nature.com", "container_name": "nature"},
+    "10.1163": {"domain": "brill.com", "publisher": "brill"},
+    "10.3390": {"domain": "mdpi.com", "publisher": "mdpi"},
+    "10.1128": {"domain": "asm.org", "publisher": "microbiology"},
+    "10.1103": {"domain": "aps.org", "publisher": "physical"},
+    "10.3389": {"domain": "frontiersin.org", "publisher": "frontiers"},
+    "10.1136": {"domain": "bmj.com", "publisher": "bmj"},
+    "10.1088": {"domain": "iop.org", "publisher": "iop"},
+    "10.1086": {"domain": "iop.org", "publisher": "iop"},
+    "10.1142": {"domain": "worldscientific.com", "publisher": "world"},
+    "10.1126": {"domain": "sciencemag.org", "container_name": "science"},
+    "10.1162": {"domain": "mitpressjournals.org", "publisher": "mit"},
+    "10.1045": {"domain": "dlib.org", "container_name": "d-lib"},
+    "10.17723": {"domain": "archivists.org", "publisher": "archiv"},
+    "10.2139": {"domain": "ssrn.com", "container_name": "social science"},
+}
+
+
+def doi_link_domain(
+    doi_prefix: str, container_name: Optional[str], publisher: Optional[str]
+) -> Optional[str]:
+    """
+    Takes a DOI prefix plus container and publisher names, and tries to guess
+    which domain name the DOI will resolve to. This is used for display only.
+
+    helpful: https://gist.github.com/TomDemeranville/8699224
+
+    TODO: JSTOR, biorxiv, medrxiv, zenodo, figshare, dryad, etc
+    """
+
+    # manual cases first
+    if doi_prefix == "10.1101" and container_name:
+        if "biorxiv" in container_name.lower():
+            return "biorxiv.org"
+        elif "medrxiv" in container_name.lower():
+            return "medrxiv.org"
+        else:
+            return None
+    elif doi_prefix == "10.1016" and container_name:  # Lancet: Elsevier prefix
+        if "lancet" in container_name.lower():
+            return "thelancet.com"
+
+    # then the map
+    meta = DOI_PREFIX_MAP.get(doi_prefix)
+    if not meta:
+        return None
+
+    if meta.get("publisher"):
+        if not publisher or meta["publisher"] not in publisher.lower():
+            return None
+    if meta.get("container_name"):
+        if not container_name or meta["container_name"] not in container_name.lower():
+            return None
+    return meta.get("domain")
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index d3a91a7..480b8fa 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -19,6 +19,7 @@ from pydantic import BaseModel
 from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
 
 from fatcat_scholar.api_entities import entity_to_dict
+from fatcat_scholar.biblio_hacks import doi_link_domain
 
 
 class DocType(str, Enum):
@@ -104,6 +105,19 @@ class ScholarBiblio(BaseModel):
     contrib_names: List[str]
     affiliations: List[str]
 
+    def doi_link_domain(self, default: str = "doi.org") -> str:
+        if not self.doi_prefix:
+            return default
+        domain = doi_link_domain(
+            self.doi_prefix,
+            container_name=self.container_name,
+            publisher=self.publisher,
+        )
+        if domain:
+            return domain
+        else:
+            return default
+
     def citation_str(self, style: str) -> Optional[str]:
         """
         Tries to format this biblio metadata as a citation string. If it fails,
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index f580cee..25ba7f5 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -99,40 +99,16 @@
   </a>
 {% endmacro %}
 
-{% macro doi_access_button(biblio, is_oa=False) %}
-  {% if biblio.doi %}
-    {% set publisher = "" or (biblio.publisher and biblio.publisher.lower()) %}
-    {% set container_name = "" or (biblio.container_name and biblio.container_name.lower()) %}
-    <a target="_blank" rel="external noopener noreferrer" href="https://doi.org/{{ biblio.doi }}">
+{% macro doi_access_button(paper, is_oa=False) %}
+  {% if paper.biblio and paper.biblio.doi %}
+    <a target="_blank" rel="external noopener noreferrer" href="https://doi.org/{{ paper.biblio.doi }}">
       <button class="ui left aligned compact blue labeled icon button serp-button">
         {% if is_oa %}
           <i class="unlock alternate icon" style="background-color: #fb971f;"></i>
         {% else %}
           <i class="external alternate icon"></i>
         {% endif %}
-        {# TODO: detect prefix? JSTOR, biorxiv, medrxiv, zenodo, figshare, dryad, etc #}
-        {# helpful: https://gist.github.com/TomDemeranville/8699224 #}
-        {% if biblio.doi_prefix == "10.6084" %}
-          figshare.com
-        {% elif biblio.doi_prefix == "10.5281" %}
-          zenodo.org
-        {% elif biblio.doi_prefix == "10.1371" %}
-          plos.org
-        {% elif biblio.doi_prefix == "10.1101" and "biorxiv" in container_name %}
-          biorxiv.org
-        {% elif biblio.doi_prefix == "10.1101" and "medrxiv" in container_name %}
-          medrxiv.org
-        {% elif biblio.doi_prefix == "10.1016" and "elsevier" in publisher %}
-          elsevier.com
-        {% elif biblio.doi_prefix in ["10.1186", "10.1007"] and "springer" in publisher %}
-          springer.com
-        {% elif biblio.doi_prefix in ["10.1042", "10.1111", "10.1177"] and "sage" in publisher %}
-          sagepub.com
-        {% elif biblio.doi_prefix in ["10.1080"] and "taylor" in publisher %}
-          tandfonline.com
-        {% else %}
-          Publisher / doi.org
-        {% endif %}
+        {{ paper._obj.biblio.doi_link_domain("Publisher / doi.org") }}
       </button>
     </a>
   {% endif %}
@@ -453,7 +429,7 @@
         {% endif %}
 
         {# publisher / repository #}
-        {{ doi_access_button(paper.biblio, is_oa=("oa" in paper.tags)) }}
+        {{ doi_access_button(paper, is_oa=("oa" in paper.tags)) }}
 
         {# trusted platform fulltext links #}
         {{ platform_access_button(paper.biblio) }}