"""
This file contains lookup tables and other static data structures used in
bibliographic metadata munging.
"""

from typing import Dict, Optional

# Maps three-letter MARC language codes to two-letter ISO 639-1 codes. A value
# of None marks a code that has no ISO 639-1 equivalent.
#
# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
# 2/T and 2/B?
# PubMed/MEDLINE and JSTOR use these MARC codes
# https://www.loc.gov/marc/languages/language_name.html
LANG_MAP_MARC: Dict[str, Optional[str]] = {
    "afr": "af",
    "alb": "sq",
    "amh": "am",
    "ara": "ar",
    "arm": "hy",
    "aze": "az",
    "ben": "bn",
    "bos": "bs",
    "bul": "bg",
    "cat": "ca",
    "chi": "zh",
    "cze": "cs",
    "dan": "da",
    "dut": "nl",
    "eng": "en",
    "epo": "eo",
    "est": "et",
    "fin": "fi",
    "fre": "fr",
    "geo": "ka",
    "ger": "de",
    "gla": "gd",
    "gre": "el",
    "heb": "he",
    "hin": "hi",
    "hrv": "hr",
    "hun": "hu",
    "ice": "is",
    "ind": "id",
    "ita": "it",
    "jpn": "ja",
    "kin": "rw",
    "kor": "ko",
    "lat": "la",
    "lav": "lv",
    "lit": "lt",
    "mac": "mk",
    "mal": "ml",
    "mao": "mi",
    "may": "ms",
    "nor": "no",
    "per": "fa",
    "pol": "pl",
    "por": "pt",
    "pus": "ps",
    "rum": "ro",
    "rus": "ru",
    "san": "sa",
    "slo": "sk",
    "slv": "sl",
    "spa": "es",
    "srp": "sr",
    "swe": "sv",
    "tha": "th",
    "tur": "tr",
    "ukr": "uk",
    "urd": "ur",
    "vie": "vi",
    "wel": "cy",
    # additions
    "gle": "ga",  # "Irish" (Gaelic)
    "jav": "jv",  # Javanese
    "welsh": "cy",  # Welsh
    "oci": "oc",  # Occitan
    # Don't have ISO 639-1 codes
    "grc": "el",  # Ancient Greek; map to modern greek
    "map": None,  # Austronesian (collection)
    "syr": None,  # Syriac, Modern
    "gem": None,  # Old Saxon
    "non": None,  # Old Norse
    "emg": None,  # Eastern Meohang
    "neg": None,  # Negidal
    "mul": None,  # Multiple languages
    "und": None,  # Undetermined
}

# these are mappings from web domains to URL 'rel' for things like file entity
# URL notation
DOMAIN_REL_MAP: Dict[str, str] = {
    "archive.org": "archive",
    # LOCKSS, Portico, DuraSpace, etc would also be "archive"
    "arxiv.org": "repository",
    "babel.hathitrust.org": "repository",
    "cds.cern.ch": "repository",
    "deepblue.lib.umich.edu": "repository",
    "europepmc.org": "repository",
    "hal.inria.fr": "repository",
    "scielo.isciii.es": "repository",
    "www.dtic.mil": "repository",
    "www.jstage.jst.go.jp": "repository",
    "www.jstor.org": "repository",
    "www.ncbi.nlm.nih.gov": "repository",
    "ftp.ncbi.nlm.nih.gov": "repository",
    "www.scielo.br": "repository",
    "www.scielo.cl": "repository",
    "www.scielo.org.mx": "repository",
    "zenodo.org": "repository",
    "www.biorxiv.org": "repository",
    "www.medrxiv.org": "repository",
    "citeseerx.ist.psu.edu": "aggregator",
    "publisher-connector.core.ac.uk": "aggregator",
    "core.ac.uk": "aggregator",
    "static.aminer.org": "aggregator",
    "aminer.org": "aggregator",
    "pdfs.semanticscholar.org": "aggregator",
    "semanticscholar.org": "aggregator",
    "www.semanticscholar.org": "aggregator",
    "academic.oup.com": "publisher",
    "cdn.elifesciences.org": "publisher",
    "cell.com": "publisher",
    "dl.acm.org": "publisher",
    "downloads.hindawi.com": "publisher",
    "elifesciences.org": "publisher",
    "iopscience.iop.org": "publisher",
    "journals.plos.org": "publisher",
    "link.springer.com": "publisher",
    "onlinelibrary.wiley.com": "publisher",
    "works.bepress.com": "publisher",
    "www.biomedcentral.com": "publisher",
    "www.cell.com": "publisher",
    "www.nature.com": "publisher",
    "www.pnas.org": "publisher",
    "www.tandfonline.com": "publisher",
    "www.frontiersin.org": "publisher",
    "www.degruyter.com": "publisher",
    "www.mdpi.com": "publisher",
    "www.ahajournals.org": "publisher",
    "ehp.niehs.nih.gov": "publisher",
    "journals.tsu.ru": "publisher",
    "www.cogentoa.com": "publisher",
    "www.researchgate.net": "academicsocial",
    "academia.edu": "academicsocial",
    "wayback.archive-it.org": "webarchive",
    "web.archive.org": "webarchive",
    "archive.is": "webarchive",
}

# Maps PubMed publication type names to release types. Publication types that
# appear only as comments below have no mapping.
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP: Dict[str, str] = {
    # Adaptive Clinical Trial
    "Address": "speech",
    "Autobiography": "book",
    # Bibliography
    "Biography": "book",
    # Case Reports
    "Classical Article": "article-journal",
    # Clinical Conference
    # Clinical Study
    # Clinical Trial
    # Clinical Trial, Phase I
    # Clinical Trial, Phase II
    # Clinical Trial, Phase III
    # Clinical Trial, Phase IV
    # Clinical Trial Protocol
    # Clinical Trial, Veterinary
    # Collected Works
    # Comparative Study
    # Congress
    # Consensus Development Conference
    # Consensus Development Conference, NIH
    # Controlled Clinical Trial
    "Dataset": "dataset",
    # Dictionary
    # Directory
    # Duplicate Publication
    "Editorial": "editorial",
    # English Abstract  # doesn't indicate that this is abstract-only
    # Equivalence Trial
    # Evaluation Studies
    # Expression of Concern
    # Festschrift
    # Government Document
    # Guideline
    "Historical Article": "article-journal",
    # Interactive Tutorial
    "Interview": "interview",
    "Introductory Journal Article": "article-journal",
    "Journal Article": "article-journal",
    "Lecture": "speech",
    "Legal Case": "legal_case",
    "Legislation": "legislation",
    "Letter": "letter",
    # Meta-Analysis
    # Multicenter Study
    # News
    "Newspaper Article": "article-newspaper",
    # Observational Study
    # Observational Study, Veterinary
    # Overall
    # Patient Education Handout
    # Periodical Index
    # Personal Narrative
    # Portrait
    # Practice Guideline
    # Pragmatic Clinical Trial
    # Publication Components
    # Publication Formats
    # Publication Type Category
    # Randomized Controlled Trial
    # Research Support, American Recovery and Reinvestment Act
    # Research Support, N.I.H., Extramural
    # Research Support, N.I.H., Intramural
    # Research Support, Non-U.S. Gov't
    # Research Support, U.S. Gov't, Non-P.H.S.
    # Research Support, U.S. Gov't, P.H.S.
    # Review  # in the "literature review" sense, not "product review"
    # Scientific Integrity Review
    # Study Characteristics
    # Support of Research
    # Systematic Review
    "Technical Report": "report",
    # Twin Study
    # Validation Studies
    # Video-Audio Media
    # Webcasts
}

# Maps English three-letter month abbreviations and zero-padded two-digit
# month strings to the month number (1-12).
MONTH_ABBR_MAP: Dict[str, int] = {
    "Jan": 1,
    "01": 1,
    "Feb": 2,
    "02": 2,
    "Mar": 3,
    "03": 3,
    "Apr": 4,
    "04": 4,
    "May": 5,
    "05": 5,
    "Jun": 6,
    "06": 6,
    "Jul": 7,
    "07": 7,
    "Aug": 8,
    "08": 8,
    "Sep": 9,
    "09": 9,
    "Oct": 10,
    "10": 10,
    "Nov": 11,
    "11": 11,
    "Dec": 12,
    "12": 12,
}

# Maps country names to lower-case two-letter country codes. Name spellings are
# kept exactly as found in the source data, since they are matched verbatim.
# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
COUNTRY_NAME_MAP: Dict[str, str] = {
    "Afghanistan": "af",
    "Albania": "al",
    "Algeria": "dz",
    "Andorra": "ad",
    "Angola": "ao",
    "Antigua and Barbuda": "ag",
    "Argentina": "ar",
    "Armenia": "am",
    "Australia": "au",
    "Austria": "at",
    "Azerbaijan": "az",
    "Bahamas": "bs",
    "Bahrain": "bh",
    "Bangladesh": "bd",
    "Barbados": "bb",
    "Belarus": "by",
    "Belgium": "be",
    "Belize": "bz",
    "Benin": "bj",
    "Bhutan": "bt",
    "Bolivia": "bo",
    "Bosnia and Herzegowina": "ba",
    "Botswana": "bw",
    "Brazil": "br",
    "Brunei Darussalam": "bn",
    "Bulgaria": "bg",
    "Burkina Faso": "bf",
    "Burundi": "bi",
    "Cambodia": "kh",
    "Cameroon": "cm",
    "Canada": "ca",
    "Cape Verde": "cv",
    "Central African Republic": "cf",
    "Chad": "td",
    "Chile": "cl",
    "China": "cn",
    "Colombia": "co",
    "Comoros": "km",
    "Congo, Democratic Republic": "cd",
    "Congo, People’s Republic": "cg",
    "Costa Rica": "cr",
    "Cote d'Ivoire": "ci",
    "Croatia (Local Name: Hrvatska)": "hr",
    "Cuba": "cu",
    "Cyprus": "cy",
    "Czech Republic": "cz",
    "Denmark": "dk",
    "Djibouti": "dj",
    "Dominica": "dm",
    "Dominican Republic": "do",
    "East Timor": "tl",
    "Ecuador": "ec",
    "El Salvador": "sv",
    "Equatorial Guinea": "gq",
    "Eritrea": "er",
    "Estonia": "ee",
    "Ethiopia": "et",
    "Fiji": "fj",
    "Finland": "fi",
    "France": "fr",
    "Gabon": "ga",
    "Gambia": "gm",
    "Georgia": "ge",
    "Germany": "de",
    "Ghana": "gh",
    "Greece": "gr",
    "Greenland": "gl",
    "Grenada": "gd",
    "Guatemala": "gt",
    "Guinea": "gn",
    "Guinea-Bissau": "gw",
    "Guyana": "gy",
    "Haiti": "ht",
    "Honduras": "hn",
    "Hong Kong": "hk",
    "Hungary": "hu",
    "Iceland": "is",
    "India": "in",
    "Indonesia": "id",
    "Iran": "ir",
    "Iraq": "iq",
    "Ireland": "ie",
    "Israel": "il",
    "Italy": "it",
    "Jamaica": "jm",
    "Japan": "jp",
    "Jordan": "jo",
    "Kazakhstan": "kz",
    "Kenya": "ke",
    "Kiribati": "ki",
    "Korea, Democratic People's Republic": "kp",
    "Korea, Republic": "kr",
    "Kuwait": "kw",
    "Kyrgyzstan": "kg",
    "Laos": "la",
    "Latvia": "lv",
    "Lebanon": "lb",
    "Lesotho": "ls",
    "Liberia": "lr",
    "Libya": "ly",
    "Liechtenstein": "li",
    "Lithuania": "lt",
    "Luxembourg": "lu",
    "Macedonia": "mk",
    "Madagascar": "mg",
    "Malawi": "mw",
    "Malaysia": "my",
    "Maldives": "mv",
    "Mali": "ml",
    "Malta": "mt",
    "Marshall Islands": "mh",
    "Mauritania": "mr",
    "Mauritius": "mu",
    "Mexico": "mx",
    "Micronesia": "fm",
    "Moldova": "md",
    "Monaco": "mc",
    "Mongolia": "mn",
    "Morocco": "ma",
    "Mozambique": "mz",
    "Myanmar": "mm",
    "Namibia": "na",
    "Nauru": "nr",
    "Nepal": "np",
    "Netherlands": "nl",
    "New Zealand": "nz",
    "Nicaragua": "ni",
    "Niger": "ne",
    "Nigeria": "ng",
    "Norway": "no",
    "Oman": "om",
    "Pakistan": "pk",
    "Palau": "pw",
    "Panama": "pa",
    "Papua New Guinea": "pg",
    "Paraguay": "py",
    "Peru": "pe",
    "Philippines": "ph",
    "Poland": "pl",
    "Portugal": "pt",
    "Puerto Rico": "pr",
    "Qatar": "qa",
    "Romania": "ro",
    "Russian Federation": "ru",
    "Rwanda": "rw",
    "Saint Kitts and Nevis": "kn",
    "Saint Lucia": "lc",
    "Saint Vincent and the Grenadines": "vc",
    "Samoa": "ws",
    "San Marino": "sm",
    "Sao Tome and Príncipe": "st",
    "Saudi Arabia": "sa",
    "Senegal": "sn",
    "Serbia and Montenegro": "cs",
    "Seychelles": "sc",
    "Sierra Leone": "sl",
    "Singapore": "sg",
    "Slovakia (Slovak Republic)": "sk",
    "Slovenia": "si",
    "Solomon Islands": "sb",
    "Somalia": "so",
    "South Africa": "za",
    "Spain": "es",
    "Sri Lanka": "lk",
    "Sudan": "sd",
    "Suriname": "sr",
    "Swaziland": "sz",
    "Sweden": "se",
    "Switzerland": "ch",
    "Syrian Arab Republic": "sy",
    "Taiwan": "tw",
    "Tajikistan": "tj",
    "Tanzania": "tz",
    "Thailand": "th",
    "Togo": "tg",
    "Tonga": "to",
    "Trinidad and Tobago": "tt",
    "Tunisia": "tn",
    "Turkey": "tr",
    "Turkmenistan": "tm",
    "Tuvalu": "tv",
    "Uganda": "ug",
    "Ukraine": "ua",
    "United Arab Emirates": "ae",
    "United Kingdom": "gb",
    "United States": "us",
    "Uruguay": "uy",
    # Additions from running over large files
    "Bosnia and Herzegovina": "ba",
    # "International"
    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
    "Russia (Federation)": "ru",
    "Scotland": "gb",
    "England": "gb",
    "Korea (South)": "kr",
    "Georgia (Republic)": "ge",
    "Egypt": "eg",
}

# NOTE(review): presumably maps a release type to the type of container it is
# published in -- confirm against callers.
CONTAINER_TYPE_MAP: Dict[str, str] = {
    "article-journal": "journal",
    "paper-conference": "conference",
    "book": "book-series",
}

# These are based, informally, on sorting the most popular licenses found in
# Crossref metadata. There were over 500 unique strings and only a few most
# popular are here; many were variants of the CC URLs. Would be useful to
# normalize CC licenses better.
# The current norm is to only add license slugs that are at least partially OA.
# NOTE: URL patterns should be lower-case, and have any trailing slash ("/")
# removed.
# Slugs are usually upper-case acronyms.
#
# Maps a scheme-less license URL pattern ("//host/path") to a short license
# slug. Each pattern is listed exactly once: repeating a key in a dict literal
# is silently collapsed to the last occurrence and only hides typos.
LICENSE_SLUG_MAP: Dict[str, str] = {
    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
    "//creativecommons.org/share-your-work/public-domain/cc0": "CC-0",
    "//creativecommons.org/licenses/by/2.0": "CC-BY",
    "//creativecommons.org/licenses/by/3.0": "CC-BY",
    "//creativecommons.org/licenses/by/4.0": "CC-BY",
    "//creativecommons.org/licenses/by-sa/3.0": "CC-BY-SA",
    "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
    "//creativecommons.org/licenses/by-nd/3.0": "CC-BY-ND",
    "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
    "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc-sa/3.0": "CC-BY-NC-SA",
    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
    "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
    "//spdx.org/licenses/cc0-1.0.json": "CC-0",
    "//spdx.org/licenses/cc-by-1.0.json": "CC-BY",
    "//spdx.org/licenses/cc-by-4.0.json": "CC-BY",
    "//spdx.org/licenses/cc-by-nc-4.0.json": "CC-BY-NC",
    "//spdx.org/licenses/cc-by-sa-3.0.json": "CC-BY-SA",
    "//spdx.org/licenses/cc-by-sa-4.0.json": "CC-BY-SA",
    "//spdx.org/licenses/mit.json": "MIT",
    "//spdx.org/licenses/ogl-canada-2.0.json": "OGL-Canada",
    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
    "//www.elsevier.com/tdm/userlicense/1.0": "ELSEVIER-USER-1.0",
    # The mixed-case key is kept for any caller that looks it up verbatim; the
    # lower-case form is what the "patterns should be lower-case" convention
    # calls for.
    # NOTE(review): confirm the lookup code lower-cases URLs before matching.
    "//www.karger.com/services/siteLicenses": "KARGER",
    "//www.karger.com/services/sitelicenses": "KARGER",
    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml": "ADS-UK",
    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess": "ADS-UK",
    "//homepage.data-planet.com/terms-use": "SAGE-DATA-PLANET",
    "//publikationen.bibliothek.kit.edu/kitopen-lizenz": "KIT-OPEN",
    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
    "//www.ametsoc.org/pubsreuselicenses": "AMETSOC",
    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
    "//arxiv.org/licenses/nonexclusive-distrib/1.0": "ARXIV-1.0",
    # skip these non-OA licenses
    # //iopscience.iop.org/page/copyright is closed
    # //www.acm.org/publications/policies/copyright_policy#Background is closed
    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
    # skip these TDM licenses; they don't apply to content
    # "//www.springer.com/tdm": "SPRINGER-TDM",
    # "//journals.sagepub.com/page/policies/text-and-data-mining-license": "SAGE-TDM",
    # "//doi.wiley.com/10.1002/tdm_license_1.1": "WILEY-TDM-1.1",
    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
    # //www.springer.com/tdm doesn't seem like a license
    # //rsc.li/journals-terms-of-use is closed for vor (am open)
}

# Map various datacite types to CSL-ish types. None means TODO or remove.
# Outer keys name the source vocabulary ("ris", "schemaOrg", "citeproc",
# "bibtex", "resourceTypeGeneral"); each inner dict maps a type string from
# that vocabulary to a CSL-ish type, or to None where no mapping is defined.
DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
    "ris": {
        "THES": "thesis",
        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report)
        "CHAP": "chapter",
        "FIGURE": "figure",
        "RPRT": "report",
        "JOUR": "article-journal",
        "MPCT": "motion_picture",
        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
        "BOOK": "book",
        "DATA": "dataset",
        "COMP": "software",
    },
    "schemaOrg": {
        "Dataset": "dataset",
        "Book": "book",
        "ScholarlyArticle": "article-journal",
        "ImageObject": "graphic",
        "Collection": None,
        "MediaObject": None,
        "Event": None,
        "SoftwareSourceCode": "software",
        "Chapter": "chapter",
        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
        "PublicationIssue": "article",
        "AudioObject": None,
        "Thesis": "thesis",
    },
    # Every entry in this table maps a type name to itself.
    # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
    "citeproc": {
        "article": "article",
        "article-journal": "article-journal",
        "article-magazine": "article-magazine",
        "article-newspaper": "article-newspaper",
        "bill": "bill",
        "book": "book",
        "broadcast": "broadcast",
        "chapter": "chapter",
        "dataset": "dataset",
        "entry-dictionary": "entry-dictionary",
        "entry-encyclopedia": "entry-encyclopedia",
        "entry": "entry",
        "figure": "figure",
        "graphic": "graphic",
        "interview": "interview",
        "legal_case": "legal_case",
        "legislation": "legislation",
        "manuscript": "manuscript",
        "map": "map",
        "motion_picture": "motion_picture",
        "musical_score": "musical_score",
        "pamphlet": "pamphlet",
        "paper-conference": "paper-conference",
        "patent": "patent",
        "personal_communication": "personal_communication",
        "post": "post",
        "post-weblog": "post-weblog",
        "report": "report",
        "review-book": "review-book",
        "review": "review",
        "song": "song",
        "speech": "speech",
        "thesis": "thesis",
        "treaty": "treaty",
        "webpage": "webpage",
    },
    "bibtex": {
        "phdthesis": "thesis",
        "inbook": "chapter",
        "misc": None,
        "article": "article-journal",
        "book": "book",
    },
    # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
    "resourceTypeGeneral": {
        "Image": "graphic",
        "Dataset": "dataset",
        "PhysicalObject": None,
        "Collection": None,
        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials"
        "Sound": None,
        "InteractiveResource": None,
        "Event": None,
        "Software": "software",
        "Other": None,
        "Workflow": None,
        "Audiovisual": None,
    },
}