diff options
Diffstat (limited to 'python/fatcat_tools/biblio_lookup_tables.py')
-rw-r--r-- | python/fatcat_tools/biblio_lookup_tables.py | 623 |
1 files changed, 623 insertions, 0 deletions
"""
This file contains lookup tables and other static data structures used in
bibliographic metadata munging.
"""

from typing import Dict, Optional

# Maps MARC language codes to two-letter (ISO 639-1) language codes. A value
# of None marks a code that has no ISO 639-1 equivalent.
#
# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
# 2/T and 2/B?
# PubMed/MEDLINE and JSTOR use these MARC codes
# https://www.loc.gov/marc/languages/language_name.html
#
# NOTE: every key must appear exactly once; a repeated key in a dict literal
# is silently collapsed by Python and hides typos.
LANG_MAP_MARC: Dict[str, Optional[str]] = {
    "afr": "af",
    "alb": "sq",
    "amh": "am",
    "ara": "ar",
    "arm": "hy",
    "aze": "az",
    "ben": "bn",
    "bos": "bs",
    "bul": "bg",
    "cat": "ca",
    "chi": "zh",
    "cze": "cs",
    "dan": "da",
    "dut": "nl",
    "eng": "en",
    "epo": "eo",
    "est": "et",
    "fin": "fi",
    "fre": "fr",
    "geo": "ka",
    "ger": "de",
    "gla": "gd",
    "gre": "el",
    "heb": "he",
    "hin": "hi",
    "hrv": "hr",
    "hun": "hu",
    "ice": "is",
    "ind": "id",
    "ita": "it",
    "jpn": "ja",
    "kin": "rw",
    "kor": "ko",
    "lat": "la",
    "lav": "lv",
    "lit": "lt",
    "mac": "mk",
    "mal": "ml",
    "mao": "mi",
    "may": "ms",
    "nor": "no",
    "per": "fa",
    "pol": "pl",
    "por": "pt",
    "pus": "ps",
    "rum": "ro",
    "rus": "ru",
    "san": "sa",
    "slo": "sk",
    "slv": "sl",
    "spa": "es",
    "srp": "sr",
    "swe": "sv",
    "tha": "th",
    "tur": "tr",
    "ukr": "uk",
    "urd": "ur",
    "vie": "vi",
    "wel": "cy",
    # additions
    "gle": "ga",  # "Irish" (Gaelic)
    "jav": "jv",  # Javanese
    "welsh": "cy",  # Welsh
    "oci": "oc",  # Occitan
    # Don't have ISO 639-1 codes
    "grc": "el",  # Ancient Greek; map to modern greek
    "map": None,  # Austronesian (collection)
    "syr": None,  # Syriac, Modern
    "gem": None,  # Old Saxon
    "non": None,  # Old Norse
    "emg": None,  # Eastern Meohang
    "neg": None,  # Negidal
    "mul": None,  # Multiple languages
    "und": None,  # Undetermined
}
# these are mappings from web domains to URL 'rel' for things like file entity
# URL notation
# Keys are exact hostnames (note that "www." variants are listed separately).
DOMAIN_REL_MAP: Dict[str, str] = {
    "archive.org": "archive",
    # LOCKSS, Portico, DuraSpace, etc would also be "archive"
    "arxiv.org": "repository",
    "babel.hathitrust.org": "repository",
    "cds.cern.ch": "repository",
    "deepblue.lib.umich.edu": "repository",
    "europepmc.org": "repository",
    "hal.inria.fr": "repository",
    "scielo.isciii.es": "repository",
    "www.dtic.mil": "repository",
    "www.jstage.jst.go.jp": "repository",
    "www.jstor.org": "repository",
    "www.ncbi.nlm.nih.gov": "repository",
    "ftp.ncbi.nlm.nih.gov": "repository",
    "www.scielo.br": "repository",
    "www.scielo.cl": "repository",
    "www.scielo.org.mx": "repository",
    "zenodo.org": "repository",
    "www.biorxiv.org": "repository",
    "www.medrxiv.org": "repository",
    "citeseerx.ist.psu.edu": "aggregator",
    "publisher-connector.core.ac.uk": "aggregator",
    "core.ac.uk": "aggregator",
    "static.aminer.org": "aggregator",
    "aminer.org": "aggregator",
    "pdfs.semanticscholar.org": "aggregator",
    "semanticscholar.org": "aggregator",
    "www.semanticscholar.org": "aggregator",
    "academic.oup.com": "publisher",
    "cdn.elifesciences.org": "publisher",
    "cell.com": "publisher",
    "dl.acm.org": "publisher",
    "downloads.hindawi.com": "publisher",
    "elifesciences.org": "publisher",
    "iopscience.iop.org": "publisher",
    "journals.plos.org": "publisher",
    "link.springer.com": "publisher",
    "onlinelibrary.wiley.com": "publisher",
    "works.bepress.com": "publisher",
    "www.biomedcentral.com": "publisher",
    "www.cell.com": "publisher",
    "www.nature.com": "publisher",
    "www.pnas.org": "publisher",
    "www.tandfonline.com": "publisher",
    "www.frontiersin.org": "publisher",
    "www.degruyter.com": "publisher",
    "www.mdpi.com": "publisher",
    "www.ahajournals.org": "publisher",
    "ehp.niehs.nih.gov": "publisher",
    "journals.tsu.ru": "publisher",
    "www.cogentoa.com": "publisher",
    "www.researchgate.net": "academicsocial",
    "academia.edu": "academicsocial",
    "wayback.archive-it.org": "webarchive",
    "web.archive.org": "webarchive",
    "archive.is": "webarchive",
}

# Maps PubMed publication type names to release types. Publication types that
# are left commented out have no mapping and are absent from the dict.
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP: Dict[str, str] = {
    # Adaptive Clinical Trial
    "Address": "speech",
    "Autobiography": "book",
    # Bibliography
    "Biography": "book",
    # Case Reports
    "Classical Article": "article-journal",
    # Clinical Conference
    # Clinical Study
    # Clinical Trial
    # Clinical Trial, Phase I
    # Clinical Trial, Phase II
    # Clinical Trial, Phase III
    # Clinical Trial, Phase IV
    # Clinical Trial Protocol
    # Clinical Trial, Veterinary
    # Collected Works
    # Comparative Study
    # Congress
    # Consensus Development Conference
    # Consensus Development Conference, NIH
    # Controlled Clinical Trial
    "Dataset": "dataset",
    # Dictionary
    # Directory
    # Duplicate Publication
    "Editorial": "editorial",
    # English Abstract  # doesn't indicate that this is abstract-only
    # Equivalence Trial
    # Evaluation Studies
    # Expression of Concern
    # Festschrift
    # Government Document
    # Guideline
    "Historical Article": "article-journal",
    # Interactive Tutorial
    "Interview": "interview",
    "Introductory Journal Article": "article-journal",
    "Journal Article": "article-journal",
    "Lecture": "speech",
    "Legal Case": "legal_case",
    "Legislation": "legislation",
    "Letter": "letter",
    # Meta-Analysis
    # Multicenter Study
    # News
    "Newspaper Article": "article-newspaper",
    # Observational Study
    # Observational Study, Veterinary
    # Overall
    # Patient Education Handout
    # Periodical Index
    # Personal Narrative
    # Portrait
    # Practice Guideline
    # Pragmatic Clinical Trial
    # Publication Components
    # Publication Formats
    # Publication Type Category
    # Randomized Controlled Trial
    # Research Support, American Recovery and Reinvestment Act
    # Research Support, N.I.H., Extramural
    # Research Support, N.I.H., Intramural
    # Research Support, Non-U.S. Gov't
    # Research Support, U.S. Gov't, Non-P.H.S.
    # Research Support, U.S. Gov't, P.H.S.
    # Review  # in the "literature review" sense, not "product review"
    # Scientific Integrity Review
    # Study Characteristics
    # Support of Research
    # Systematic Review
    "Technical Report": "report",
    # Twin Study
    # Validation Studies
    # Video-Audio Media
    # Webcasts
}

# Maps both three-letter English month abbreviations and zero-padded two-digit
# month strings to the month number (1-12).
MONTH_ABBR_MAP: Dict[str, int] = {
    "Jan": 1,
    "01": 1,
    "Feb": 2,
    "02": 2,
    "Mar": 3,
    "03": 3,
    "Apr": 4,
    "04": 4,
    "May": 5,
    "05": 5,
    "Jun": 6,
    "06": 6,
    "Jul": 7,
    "07": 7,
    "Aug": 8,
    "08": 8,
    "Sep": 9,
    "09": 9,
    "Oct": 10,
    "10": 10,
    "Nov": 11,
    "11": 11,
    "Dec": 12,
    "12": 12,
}

# Maps country names, spelled as they appear in the upstream list, to
# lowercase two-letter country codes (presumably ISO 3166-1 alpha-2; confirm
# before relying on that). Spellings such as "Bosnia and Herzegowina" are kept
# verbatim because lookups are by exact string.
# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
COUNTRY_NAME_MAP: Dict[str, str] = {
    "Afghanistan": "af",
    "Albania": "al",
    "Algeria": "dz",
    "Andorra": "ad",
    "Angola": "ao",
    "Antigua and Barbuda": "ag",
    "Argentina": "ar",
    "Armenia": "am",
    "Australia": "au",
    "Austria": "at",
    "Azerbaijan": "az",
    "Bahamas": "bs",
    "Bahrain": "bh",
    "Bangladesh": "bd",
    "Barbados": "bb",
    "Belarus": "by",
    "Belgium": "be",
    "Belize": "bz",
    "Benin": "bj",
    "Bhutan": "bt",
    "Bolivia": "bo",
    "Bosnia and Herzegowina": "ba",
    "Botswana": "bw",
    "Brazil": "br",
    "Brunei Darussalam": "bn",
    "Bulgaria": "bg",
    "Burkina Faso": "bf",
    "Burundi": "bi",
    "Cambodia": "kh",
    "Cameroon": "cm",
    "Canada": "ca",
    "Cape Verde": "cv",
    "Central African Republic": "cf",
    "Chad": "td",
    "Chile": "cl",
    "China": "cn",
    "Colombia": "co",
    "Comoros": "km",
    "Congo, Democratic Republic": "cd",
    "Congo, People’s Republic": "cg",
    "Costa Rica": "cr",
    "Cote d'Ivoire": "ci",
    "Croatia (Local Name: Hrvatska)": "hr",
    "Cuba": "cu",
    "Cyprus": "cy",
    "Czech Republic": "cz",
    "Denmark": "dk",
    "Djibouti": "dj",
    "Dominica": "dm",
    "Dominican Republic": "do",
    "East Timor": "tl",
    "Ecuador": "ec",
    "El Salvador": "sv",
    "Equatorial Guinea": "gq",
    "Eritrea": "er",
    "Estonia": "ee",
    "Ethiopia": "et",
    "Fiji": "fj",
    "Finland": "fi",
    "France": "fr",
    "Gabon": "ga",
    "Gambia": "gm",
    "Georgia": "ge",
    "Germany": "de",
    "Ghana": "gh",
    "Greece": "gr",
    "Greenland": "gl",
    "Grenada": "gd",
    "Guatemala": "gt",
    "Guinea": "gn",
    "Guinea-Bissau": "gw",
    "Guyana": "gy",
    "Haiti": "ht",
    "Honduras": "hn",
    "Hong Kong": "hk",
    "Hungary": "hu",
    "Iceland": "is",
    "India": "in",
    "Indonesia": "id",
    "Iran": "ir",
    "Iraq": "iq",
    "Ireland": "ie",
    "Israel": "il",
    "Italy": "it",
    "Jamaica": "jm",
    "Japan": "jp",
    "Jordan": "jo",
    "Kazakhstan": "kz",
    "Kenya": "ke",
    "Kiribati": "ki",
    "Korea, Democratic People's Republic": "kp",
    "Korea, Republic": "kr",
    "Kuwait": "kw",
    "Kyrgyzstan": "kg",
    "Laos": "la",
    "Latvia": "lv",
    "Lebanon": "lb",
    "Lesotho": "ls",
    "Liberia": "lr",
    "Libya": "ly",
    "Liechtenstein": "li",
    "Lithuania": "lt",
    "Luxembourg": "lu",
    "Macedonia": "mk",
    "Madagascar": "mg",
    "Malawi": "mw",
    "Malaysia": "my",
    "Maldives": "mv",
    "Mali": "ml",
    "Malta": "mt",
    "Marshall Islands": "mh",
    "Mauritania": "mr",
    "Mauritius": "mu",
    "Mexico": "mx",
    "Micronesia": "fm",
    "Moldova": "md",
    "Monaco": "mc",
    "Mongolia": "mn",
    "Morocco": "ma",
    "Mozambique": "mz",
    "Myanmar": "mm",
    "Namibia": "na",
    "Nauru": "nr",
    "Nepal": "np",
    "Netherlands": "nl",
    "New Zealand": "nz",
    "Nicaragua": "ni",
    "Niger": "ne",
    "Nigeria": "ng",
    "Norway": "no",
    "Oman": "om",
    "Pakistan": "pk",
    "Palau": "pw",
    "Panama": "pa",
    "Papua New Guinea": "pg",
    "Paraguay": "py",
    "Peru": "pe",
    "Philippines": "ph",
    "Poland": "pl",
    "Portugal": "pt",
    "Puerto Rico": "pr",
    "Qatar": "qa",
    "Romania": "ro",
    "Russian Federation": "ru",
    "Rwanda": "rw",
    "Saint Kitts and Nevis": "kn",
    "Saint Lucia": "lc",
    "Saint Vincent and the Grenadines": "vc",
    "Samoa": "ws",
    "San Marino": "sm",
    "Sao Tome and Príncipe": "st",
    "Saudi Arabia": "sa",
    "Senegal": "sn",
    "Serbia and Montenegro": "cs",
    "Seychelles": "sc",
    "Sierra Leone": "sl",
    "Singapore": "sg",
    "Slovakia (Slovak Republic)": "sk",
    "Slovenia": "si",
    "Solomon Islands": "sb",
    "Somalia": "so",
    "South Africa": "za",
    "Spain": "es",
    "Sri Lanka": "lk",
    "Sudan": "sd",
    "Suriname": "sr",
    "Swaziland": "sz",
    "Sweden": "se",
    "Switzerland": "ch",
    "Syrian Arab Republic": "sy",
    "Taiwan": "tw",
    "Tajikistan": "tj",
    "Tanzania": "tz",
    "Thailand": "th",
    "Togo": "tg",
    "Tonga": "to",
    "Trinidad and Tobago": "tt",
    "Tunisia": "tn",
    "Turkey": "tr",
    "Turkmenistan": "tm",
    "Tuvalu": "tv",
    "Uganda": "ug",
    "Ukraine": "ua",
    "United Arab Emirates": "ae",
    "United Kingdom": "gb",
    "United States": "us",
    "Uruguay": "uy",
    # Additions from running over large files
    "Bosnia and Herzegovina": "ba",
    # "International"
    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
    "Russia (Federation)": "ru",
    "Scotland": "gb",
    "England": "gb",
    "Korea (South)": "kr",
    "Georgia (Republic)": "ge",
    "Egypt": "eg",
}

# Presumably maps a release type to the type of container such a release is
# published in -- confirm against callers.
CONTAINER_TYPE_MAP: Dict[str, str] = {
    "article-journal": "journal",
    "paper-conference": "conference",
    "book": "book-series",
}

# These are based, informally, on sorting the most popular licenses found in
# Crossref metadata. There were over 500 unique strings and only a few most
# popular are here; many were variants of the CC URLs. Would be useful to
# normalize CC licenses better.
# The current norm is to only add license slugs that are at least partially OA.
# Maps license URLs, with the scheme stripped so that each key starts at "//",
# to short license slugs. Lookups are by exact string, which is why variants
# with and without a trailing slash are listed separately.
# NOTE: every key must appear exactly once; a repeated key in a dict literal
# is silently collapsed by Python and hides typos.
LICENSE_SLUG_MAP: Dict[str, str] = {
    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
    "//spdx.org/licenses/MIT.json": "MIT",
    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
    "//www.karger.com/Services/SiteLicenses": "KARGER",
    "//www.karger.com/Services/SiteLicenses/": "KARGER",
    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
    # //www.springer.com/tdm doesn't seem like a license
    # //iopscience.iop.org/page/copyright is closed
    # //www.acm.org/publications/policies/copyright_policy#Background is closed
    # //rsc.li/journals-terms-of-use is closed for vor (am open)
    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
    # skip these TDM licenses; they don't apply to content
    # "//www.springer.com/tdm/": "SPRINGER-TDM",
    # "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
    # "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
}

# Map various datacite types to CSL-ish types.
# A None value means TODO or remove.
#
# The outer key names the vocabulary the incoming type string is expressed in;
# the inner dict maps a type string from that vocabulary to a CSL-ish type.
DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
    "ris": {
        "THES": "thesis",
        # 99.9% maps to citeproc song, so use that (exception: report)
        "SOUND": "song",
        "CHAP": "chapter",
        "FIGURE": "figure",
        "RPRT": "report",
        "JOUR": "article-journal",
        "MPCT": "motion_picture",
        # GEN consist of 99% article and report, post-weblog, misc - and one dataset
        "GEN": "article-journal",
        "BOOK": "book",
        "DATA": "dataset",
        "COMP": "software",
    },
    "schemaOrg": {
        "Dataset": "dataset",
        "Book": "book",
        "ScholarlyArticle": "article-journal",
        "ImageObject": "graphic",
        "Collection": None,
        "MediaObject": None,
        "Event": None,
        "SoftwareSourceCode": "software",
        "Chapter": "chapter",
        # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
        "CreativeWork": None,
        "PublicationIssue": "article",
        "AudioObject": None,
        "Thesis": "thesis",
    },
    # Every citeproc type maps onto itself, so the table is generated from the
    # list of type names.
    # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
    "citeproc": {
        csl_type: csl_type
        for csl_type in (
            "article",
            "article-journal",
            "article-magazine",
            "article-newspaper",
            "bill",
            "book",
            "broadcast",
            "chapter",
            "dataset",
            "entry-dictionary",
            "entry-encyclopedia",
            "entry",
            "figure",
            "graphic",
            "interview",
            "legal_case",
            "legislation",
            "manuscript",
            "map",
            "motion_picture",
            "musical_score",
            "pamphlet",
            "paper-conference",
            "patent",
            "personal_communication",
            "post",
            "post-weblog",
            "report",
            "review-book",
            "review",
            "song",
            "speech",
            "thesis",
            "treaty",
            "webpage",
        )
    },
    "bibtex": {
        "phdthesis": "thesis",
        "inbook": "chapter",
        "misc": None,
        "article": "article-journal",
        "book": "book",
    },
    # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
    "resourceTypeGeneral": {
        "Image": "graphic",
        "Dataset": "dataset",
        "PhysicalObject": None,
        "Collection": None,
        # "Greyliterature, labnotes, accompanyingmaterials"
        "Text": None,
        "Sound": None,
        "InteractiveResource": None,
        "Event": None,
        "Software": "software",
        "Other": None,
        "Workflow": None,
        "Audiovisual": None,
    },
}