aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-10 13:52:39 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-10 13:52:43 -0800
commitddc757bc1d5c610f42e9f5f10a4f060f517b66ca (patch)
tree86ccdef998bd3da3910cfe8fb9f2177b58a664a0 /python/fatcat_tools
parent16e9979a6f347b49764c1141209e84083ea81057 (diff)
downloadfatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.tar.gz
fatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.zip
refactor importer metadata tables into separate file; move some helpers around
- MAX_ABSTRACT_LENGTH set in a single place (importer common) - merge datacite license slug table into common table, removing some TDM-specific licenses (which do not apply in the context of preserving the full work)
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/biblio_lookup_tables.py623
-rw-r--r--python/fatcat_tools/importers/__init__.py3
-rw-r--r--python/fatcat_tools/importers/common.py61
-rw-r--r--python/fatcat_tools/importers/crossref.py94
-rw-r--r--python/fatcat_tools/importers/datacite.py155
-rw-r--r--python/fatcat_tools/importers/doaj_article.py5
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py6
-rw-r--r--python/fatcat_tools/importers/jstor.py3
-rw-r--r--python/fatcat_tools/importers/pubmed.py319
-rw-r--r--python/fatcat_tools/normal.py115
10 files changed, 682 insertions, 702 deletions
diff --git a/python/fatcat_tools/biblio_lookup_tables.py b/python/fatcat_tools/biblio_lookup_tables.py
new file mode 100644
index 00000000..a9a097ae
--- /dev/null
+++ b/python/fatcat_tools/biblio_lookup_tables.py
@@ -0,0 +1,623 @@
+"""
+This file contains lookup tables and other static data structures used in
+bibliographic metadata munging.
+"""
+
+from typing import Dict, Optional
+
+# Maps MARC language codes to ISO 639-1 language codes.
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+# A value of None means there is no ISO 639-1 code to map to.
+LANG_MAP_MARC: Dict[str, Optional[str]] = {
+ "afr": "af",
+ "alb": "sq",
+ "amh": "am",
+ "ara": "ar",
+ "arm": "hy",
+ "aze": "az",
+ "ben": "bn",
+ "bos": "bs",
+ "bul": "bg",
+ "cat": "ca",
+ "chi": "zh",
+ "cze": "cs",
+ "dan": "da",
+ "dut": "nl",
+ "eng": "en",
+ "epo": "eo",
+ "est": "et",
+ "fin": "fi",
+ "fre": "fr",
+ "geo": "ka",
+ "ger": "de",
+ "gla": "gd",
+ "gre": "el",
+ "heb": "he",
+ "hin": "hi",
+ "hrv": "hr",
+ "hun": "hu",
+ "ice": "is",
+ "ind": "id",
+ "ita": "it",
+ "jpn": "ja",
+ "kin": "rw",
+ "kor": "ko",
+ "lat": "la",
+ "lav": "lv",
+ "lit": "lt",
+ "mac": "mk",
+ "mal": "ml",
+ "mao": "mi",
+ "may": "ms",
+ "nor": "no",
+ "per": "fa",
+ "pol": "pl",
+ "por": "pt",
+ "pus": "ps",
+ "rum": "ro",
+ "rus": "ru",
+ "san": "sa",
+ "slo": "sk",
+ "slv": "sl",
+ "spa": "es",
+ "srp": "sr",
+ "swe": "sv",
+ "tha": "th",
+ "tur": "tr",
+ "ukr": "uk",
+ "urd": "ur",
+ "vie": "vi",
+ "wel": "cy",
+ # additions
+ "gle": "ga", # "Irish" (Gaelic)
+ "jav": "jv", # Javanese
+ "welsh": "cy", # Welsh
+ "oci": "oc", # Occitan
+ # Don't have ISO 639-1 codes
+ "grc": "el", # Ancient Greek; map to modern greek
+ "map": None, # Austronesian (collection)
+ "syr": None, # Syriac, Modern
+ "gem": None, # Old Saxon
+ "non": None, # Old Norse
+ "emg": None, # Eastern Meohang
+ "neg": None, # Negidal
+ "mul": None, # Multiple languages
+ "und": None, # Undetermined
+}
+
+# these are mappings from web domains to URL 'rel' for things like file entity
+# URL notation
+# Keys are exact hostnames: bare and "www." variants of the same site (e.g.
+# "cell.com" and "www.cell.com") are listed as separate entries.
+DOMAIN_REL_MAP: Dict[str, str] = {
+ "archive.org": "archive",
+ # LOCKSS, Portico, DuraSpace, etc would also be "archive"
+ # repositories
+ "arxiv.org": "repository",
+ "babel.hathitrust.org": "repository",
+ "cds.cern.ch": "repository",
+ "deepblue.lib.umich.edu": "repository",
+ "europepmc.org": "repository",
+ "hal.inria.fr": "repository",
+ "scielo.isciii.es": "repository",
+ "www.dtic.mil": "repository",
+ "www.jstage.jst.go.jp": "repository",
+ "www.jstor.org": "repository",
+ "www.ncbi.nlm.nih.gov": "repository",
+ "ftp.ncbi.nlm.nih.gov": "repository",
+ "www.scielo.br": "repository",
+ "www.scielo.cl": "repository",
+ "www.scielo.org.mx": "repository",
+ "zenodo.org": "repository",
+ "www.biorxiv.org": "repository",
+ "www.medrxiv.org": "repository",
+ # aggregators
+ "citeseerx.ist.psu.edu": "aggregator",
+ "publisher-connector.core.ac.uk": "aggregator",
+ "core.ac.uk": "aggregator",
+ "static.aminer.org": "aggregator",
+ "aminer.org": "aggregator",
+ "pdfs.semanticscholar.org": "aggregator",
+ "semanticscholar.org": "aggregator",
+ "www.semanticscholar.org": "aggregator",
+ # publishers
+ "academic.oup.com": "publisher",
+ "cdn.elifesciences.org": "publisher",
+ "cell.com": "publisher",
+ "dl.acm.org": "publisher",
+ "downloads.hindawi.com": "publisher",
+ "elifesciences.org": "publisher",
+ "iopscience.iop.org": "publisher",
+ "journals.plos.org": "publisher",
+ "link.springer.com": "publisher",
+ "onlinelibrary.wiley.com": "publisher",
+ "works.bepress.com": "publisher",
+ "www.biomedcentral.com": "publisher",
+ "www.cell.com": "publisher",
+ "www.nature.com": "publisher",
+ "www.pnas.org": "publisher",
+ "www.tandfonline.com": "publisher",
+ "www.frontiersin.org": "publisher",
+ "www.degruyter.com": "publisher",
+ "www.mdpi.com": "publisher",
+ "www.ahajournals.org": "publisher",
+ "ehp.niehs.nih.gov": "publisher",
+ "journals.tsu.ru": "publisher",
+ "www.cogentoa.com": "publisher",
+ # academic social networks
+ "www.researchgate.net": "academicsocial",
+ "academia.edu": "academicsocial",
+ # web archives
+ "wayback.archive-it.org": "webarchive",
+ "web.archive.org": "webarchive",
+ "archive.is": "webarchive",
+}
+
+# Maps PubMed publication type names to release type strings.
+# Publication types that appear only as comments below have no mapping.
+# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
+PUBMED_RELEASE_TYPE_MAP = {
+ # Adaptive Clinical Trial
+ "Address": "speech",
+ "Autobiography": "book",
+ # Bibliography
+ "Biography": "book",
+ # Case Reports
+ "Classical Article": "article-journal",
+ # Clinical Conference
+ # Clinical Study
+ # Clinical Trial
+ # Clinical Trial, Phase I
+ # Clinical Trial, Phase II
+ # Clinical Trial, Phase III
+ # Clinical Trial, Phase IV
+ # Clinical Trial Protocol
+ # Clinical Trial, Veterinary
+ # Collected Works
+ # Comparative Study
+ # Congress
+ # Consensus Development Conference
+ # Consensus Development Conference, NIH
+ # Controlled Clinical Trial
+ "Dataset": "dataset",
+ # Dictionary
+ # Directory
+ # Duplicate Publication
+ "Editorial": "editorial",
+ # English Abstract # doesn't indicate that this is abstract-only
+ # Equivalence Trial
+ # Evaluation Studies
+ # Expression of Concern
+ # Festschrift
+ # Government Document
+ # Guideline
+ "Historical Article": "article-journal",
+ # Interactive Tutorial
+ "Interview": "interview",
+ "Introductory Journal Article": "article-journal",
+ "Journal Article": "article-journal",
+ "Lecture": "speech",
+ "Legal Case": "legal_case",
+ "Legislation": "legislation",
+ "Letter": "letter",
+ # Meta-Analysis
+ # Multicenter Study
+ # News
+ "Newspaper Article": "article-newspaper",
+ # Observational Study
+ # Observational Study, Veterinary
+ # Overall
+ # Patient Education Handout
+ # Periodical Index
+ # Personal Narrative
+ # Portrait
+ # Practice Guideline
+ # Pragmatic Clinical Trial
+ # Publication Components
+ # Publication Formats
+ # Publication Type Category
+ # Randomized Controlled Trial
+ # Research Support, American Recovery and Reinvestment Act
+ # Research Support, N.I.H., Extramural
+ # Research Support, N.I.H., Intramural
+ # Research Support, Non-U.S. Gov't
+ # Research Support, U.S. Gov't, Non-P.H.S.
+ # Research Support, U.S. Gov't, P.H.S.
+ # Review # in the "literature review" sense, not "product review"
+ # Scientific Integrity Review
+ # Study Characteristics
+ # Support of Research
+ # Systematic Review
+ "Technical Report": "report",
+ # Twin Study
+ # Validation Studies
+ # Video-Audio Media
+ # Webcasts
+}
+
+# Maps three-letter English month abbreviations ("Jan") and zero-padded
+# two-digit month strings ("01") to the month number, 1 through 12.
+MONTH_ABBR_MAP: Dict[str, int] = {
+ "Jan": 1,
+ "01": 1,
+ "Feb": 2,
+ "02": 2,
+ "Mar": 3,
+ "03": 3,
+ "Apr": 4,
+ "04": 4,
+ "May": 5,
+ "05": 5,
+ "Jun": 6,
+ "06": 6,
+ "Jul": 7,
+ "07": 7,
+ "Aug": 8,
+ "08": 8,
+ "Sep": 9,
+ "09": 9,
+ "Oct": 10,
+ "10": 10,
+ "Nov": 11,
+ "11": 11,
+ "Dec": 12,
+ "12": 12,
+}
+
+# Maps country names to lowercase two-letter country codes.
+# Lookups are by exact string, so spelling variants of the same country (e.g.
+# "Bosnia and Herzegowina" / "Bosnia and Herzegovina") need separate entries.
+# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
+COUNTRY_NAME_MAP: Dict[str, str] = {
+ "Afghanistan": "af",
+ "Albania": "al",
+ "Algeria": "dz",
+ "Andorra": "ad",
+ "Angola": "ao",
+ "Antigua and Barbuda": "ag",
+ "Argentina": "ar",
+ "Armenia": "am",
+ "Australia": "au",
+ "Austria": "at",
+ "Azerbaijan": "az",
+ "Bahamas": "bs",
+ "Bahrain": "bh",
+ "Bangladesh": "bd",
+ "Barbados": "bb",
+ "Belarus": "by",
+ "Belgium": "be",
+ "Belize": "bz",
+ "Benin": "bj",
+ "Bhutan": "bt",
+ "Bolivia": "bo",
+ "Bosnia and Herzegowina": "ba",
+ "Botswana": "bw",
+ "Brazil": "br",
+ "Brunei Darussalam": "bn",
+ "Bulgaria": "bg",
+ "Burkina Faso": "bf",
+ "Burundi": "bi",
+ "Cambodia": "kh",
+ "Cameroon": "cm",
+ "Canada": "ca",
+ "Cape Verde": "cv",
+ "Central African Republic": "cf",
+ "Chad": "td",
+ "Chile": "cl",
+ "China": "cn",
+ "Colombia": "co",
+ "Comoros": "km",
+ "Congo, Democratic Republic": "cd",
+ "Congo, People’s Republic": "cg",
+ "Costa Rica": "cr",
+ "Cote d'Ivoire": "ci",
+ "Croatia (Local Name: Hrvatska)": "hr",
+ "Cuba": "cu",
+ "Cyprus": "cy",
+ "Czech Republic": "cz",
+ "Denmark": "dk",
+ "Djibouti": "dj",
+ "Dominica": "dm",
+ "Dominican Republic": "do",
+ "East Timor": "tl",
+ "Ecuador": "ec",
+ "El Salvador": "sv",
+ "Equatorial Guinea": "gq",
+ "Eritrea": "er",
+ "Estonia": "ee",
+ "Ethiopia": "et",
+ "Fiji": "fj",
+ "Finland": "fi",
+ "France": "fr",
+ "Gabon": "ga",
+ "Gambia": "gm",
+ "Georgia": "ge",
+ "Germany": "de",
+ "Ghana": "gh",
+ "Greece": "gr",
+ "Greenland": "gl",
+ "Grenada": "gd",
+ "Guatemala": "gt",
+ "Guinea": "gn",
+ "Guinea-Bissau": "gw",
+ "Guyana": "gy",
+ "Haiti": "ht",
+ "Honduras": "hn",
+ "Hong Kong": "hk",
+ "Hungary": "hu",
+ "Iceland": "is",
+ "India": "in",
+ "Indonesia": "id",
+ "Iran": "ir",
+ "Iraq": "iq",
+ "Ireland": "ie",
+ "Israel": "il",
+ "Italy": "it",
+ "Jamaica": "jm",
+ "Japan": "jp",
+ "Jordan": "jo",
+ "Kazakhstan": "kz",
+ "Kenya": "ke",
+ "Kiribati": "ki",
+ "Korea, Democratic People's Republic": "kp",
+ "Korea, Republic": "kr",
+ "Kuwait": "kw",
+ "Kyrgyzstan": "kg",
+ "Laos": "la",
+ "Latvia": "lv",
+ "Lebanon": "lb",
+ "Lesotho": "ls",
+ "Liberia": "lr",
+ "Libya": "ly",
+ "Liechtenstein": "li",
+ "Lithuania": "lt",
+ "Luxembourg": "lu",
+ "Macedonia": "mk",
+ "Madagascar": "mg",
+ "Malawi": "mw",
+ "Malaysia": "my",
+ "Maldives": "mv",
+ "Mali": "ml",
+ "Malta": "mt",
+ "Marshall Islands": "mh",
+ "Mauritania": "mr",
+ "Mauritius": "mu",
+ "Mexico": "mx",
+ "Micronesia": "fm",
+ "Moldova": "md",
+ "Monaco": "mc",
+ "Mongolia": "mn",
+ "Morocco": "ma",
+ "Mozambique": "mz",
+ "Myanmar": "mm",
+ "Namibia": "na",
+ "Nauru": "nr",
+ "Nepal": "np",
+ "Netherlands": "nl",
+ "New Zealand": "nz",
+ "Nicaragua": "ni",
+ "Niger": "ne",
+ "Nigeria": "ng",
+ "Norway": "no",
+ "Oman": "om",
+ "Pakistan": "pk",
+ "Palau": "pw",
+ "Panama": "pa",
+ "Papua New Guinea": "pg",
+ "Paraguay": "py",
+ "Peru": "pe",
+ "Philippines": "ph",
+ "Poland": "pl",
+ "Portugal": "pt",
+ "Puerto Rico": "pr",
+ "Qatar": "qa",
+ "Romania": "ro",
+ "Russian Federation": "ru",
+ "Rwanda": "rw",
+ "Saint Kitts and Nevis": "kn",
+ "Saint Lucia": "lc",
+ "Saint Vincent and the Grenadines": "vc",
+ "Samoa": "ws",
+ "San Marino": "sm",
+ "Sao Tome and Príncipe": "st",
+ "Saudi Arabia": "sa",
+ "Senegal": "sn",
+ "Serbia and Montenegro": "cs",
+ "Seychelles": "sc",
+ "Sierra Leone": "sl",
+ "Singapore": "sg",
+ "Slovakia (Slovak Republic)": "sk",
+ "Slovenia": "si",
+ "Solomon Islands": "sb",
+ "Somalia": "so",
+ "South Africa": "za",
+ "Spain": "es",
+ "Sri Lanka": "lk",
+ "Sudan": "sd",
+ "Suriname": "sr",
+ "Swaziland": "sz",
+ "Sweden": "se",
+ "Switzerland": "ch",
+ "Syrian Arab Republic": "sy",
+ "Taiwan": "tw",
+ "Tajikistan": "tj",
+ "Tanzania": "tz",
+ "Thailand": "th",
+ "Togo": "tg",
+ "Tonga": "to",
+ "Trinidad and Tobago": "tt",
+ "Tunisia": "tn",
+ "Turkey": "tr",
+ "Turkmenistan": "tm",
+ "Tuvalu": "tv",
+ "Uganda": "ug",
+ "Ukraine": "ua",
+ "United Arab Emirates": "ae",
+ "United Kingdom": "gb",
+ "United States": "us",
+ "Uruguay": "uy",
+ # Additions from running over large files
+ "Bosnia and Herzegovina": "ba",
+ # "International"
+ "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
+ "Russia (Federation)": "ru",
+ "Scotland": "gb",
+ "England": "gb",
+ "Korea (South)": "kr",
+ "Georgia (Republic)": "ge",
+ "Egypt": "eg",
+}
+
+# Maps a type string to a container type string.
+# NOTE(review): keys look like release types ("article-journal", ...) --
+# confirm against the callers in the crossref importer.
+CONTAINER_TYPE_MAP: Dict[str, str] = {
+ "article-journal": "journal",
+ "paper-conference": "conference",
+ "book": "book-series",
+}
+
+# Maps scheme-less license URLs (starting with "//") to short license slugs.
+# These are based, informally, on sorting the most popular licenses found in
+# Crossref metadata. There were over 500 unique strings and only a few most
+# popular are here; many were variants of the CC URLs. Would be useful to
+# normalize CC licenses better.
+# The current norm is to only add license slugs that are at least partially OA.
+# Each URL should appear only once: in a dict literal a repeated key silently
+# overrides the earlier entry.
+LICENSE_SLUG_MAP: Dict[str, str] = {
+ "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
+ "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
+ "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
+ "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+ "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+ "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
+ "//creativecommons.org/licenses/by/2.0/": "CC-BY",
+ "//creativecommons.org/licenses/by/3.0/": "CC-BY",
+ "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+ "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+ "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+ "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+ "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+ "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+ "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+ "//spdx.org/licenses/CC0-1.0.json": "CC-0",
+ "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
+ "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
+ "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
+ "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
+ "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
+ "//spdx.org/licenses/MIT.json": "MIT",
+ "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
+ "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ "//www.karger.com/Services/SiteLicenses": "KARGER",
+ "//www.karger.com/Services/SiteLicenses/": "KARGER",
+ "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
+ "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
+ "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
+ "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
+ "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
+ "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
+ "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
+ "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
+ "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
+ "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
+ "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
+ "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
+ "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+ "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
+ "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
+ "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
+ # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+ # //www.springer.com/tdm doesn't seem like a license
+ # //iopscience.iop.org/page/copyright is closed
+ # //www.acm.org/publications/policies/copyright_policy#Background is closed
+ # //rsc.li/journals-terms-of-use is closed for vor (am open)
+ # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
+ "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
+ # skip these TDM licenses; they don't apply to content
+ # "//www.springer.com/tdm/": "SPRINGER-TDM",
+ # "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
+ # "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
+}
+
+# Map the various DataCite type vocabularies to CSL-ish types. None means TODO
+# or remove.
+# The outer key names the vocabulary ("ris", "schemaOrg", "citeproc",
+# "bibtex", "resourceTypeGeneral"); the inner dict maps a value from that
+# vocabulary to the CSL-ish type.
+DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
+ "ris": {
+ "THES": "thesis",
+ "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
+ "CHAP": "chapter",
+ "FIGURE": "figure",
+ "RPRT": "report",
+ "JOUR": "article-journal",
+ "MPCT": "motion_picture",
+ "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset
+ "BOOK": "book",
+ "DATA": "dataset",
+ "COMP": "software",
+ },
+ "schemaOrg": {
+ "Dataset": "dataset",
+ "Book": "book",
+ "ScholarlyArticle": "article-journal",
+ "ImageObject": "graphic",
+ "Collection": None,
+ "MediaObject": None,
+ "Event": None,
+ "SoftwareSourceCode": "software",
+ "Chapter": "chapter",
+ "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+ "PublicationIssue": "article",
+ "AudioObject": None,
+ "Thesis": "thesis",
+ },
+ # citeproc values are already CSL types, so this is an identity mapping
+ "citeproc": {
+ "article": "article",
+ "article-journal": "article-journal",
+ "article-magazine": "article-magazine",
+ "article-newspaper": "article-newspaper",
+ "bill": "bill",
+ "book": "book",
+ "broadcast": "broadcast",
+ "chapter": "chapter",
+ "dataset": "dataset",
+ "entry-dictionary": "entry-dictionary",
+ "entry-encyclopedia": "entry-encyclopedia",
+ "entry": "entry",
+ "figure": "figure",
+ "graphic": "graphic",
+ "interview": "interview",
+ "legal_case": "legal_case",
+ "legislation": "legislation",
+ "manuscript": "manuscript",
+ "map": "map",
+ "motion_picture": "motion_picture",
+ "musical_score": "musical_score",
+ "pamphlet": "pamphlet",
+ "paper-conference": "paper-conference",
+ "patent": "patent",
+ "personal_communication": "personal_communication",
+ "post": "post",
+ "post-weblog": "post-weblog",
+ "report": "report",
+ "review-book": "review-book",
+ "review": "review",
+ "song": "song",
+ "speech": "speech",
+ "thesis": "thesis",
+ "treaty": "treaty",
+ "webpage": "webpage",
+ }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
+ "bibtex": {
+ "phdthesis": "thesis",
+ "inbook": "chapter",
+ "misc": None,
+ "article": "article-journal",
+ "book": "book",
+ },
+ "resourceTypeGeneral": {
+ "Image": "graphic",
+ "Dataset": "dataset",
+ "PhysicalObject": None,
+ "Collection": None,
+ "Text": None, # "Greyliterature, labnotes, accompanyingmaterials"
+ "Sound": None,
+ "InteractiveResource": None,
+ "Event": None,
+ "Software": "software",
+ "Other": None,
+ "Workflow": None,
+ "Audiovisual": None,
+ }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+}
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 4d4d696b..654be2e9 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -26,9 +26,8 @@ from .common import (
KafkaJsonPusher,
LinePusher,
SqlitePusher,
- make_kafka_consumer,
)
-from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
+from .crossref import CrossrefImporter
from .datacite import DataciteImporter
from .dblp_container import DblpContainerImporter
from .dblp_release import DblpReleaseImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 56c3d32e..7c587395 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,71 +27,14 @@ from fatcat_openapi_client import (
from fatcat_openapi_client.rest import ApiException
from fuzzycat.matching import match_release_fuzzy
+from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP
from fatcat_tools.normal import clean_doi
from fatcat_tools.transforms import entity_to_dict
DATE_FMT: str = "%Y-%m-%d"
SANE_MAX_RELEASES: int = 200
SANE_MAX_URLS: int = 100
-
-DOMAIN_REL_MAP: Dict[str, str] = {
- "archive.org": "archive",
- # LOCKSS, Portico, DuraSpace, etc would also be "archive"
- "arxiv.org": "repository",
- "babel.hathitrust.org": "repository",
- "cds.cern.ch": "repository",
- "deepblue.lib.umich.edu": "repository",
- "europepmc.org": "repository",
- "hal.inria.fr": "repository",
- "scielo.isciii.es": "repository",
- "www.dtic.mil": "repository",
- "www.jstage.jst.go.jp": "repository",
- "www.jstor.org": "repository",
- "www.ncbi.nlm.nih.gov": "repository",
- "ftp.ncbi.nlm.nih.gov": "repository",
- "www.scielo.br": "repository",
- "www.scielo.cl": "repository",
- "www.scielo.org.mx": "repository",
- "zenodo.org": "repository",
- "www.biorxiv.org": "repository",
- "www.medrxiv.org": "repository",
- "citeseerx.ist.psu.edu": "aggregator",
- "publisher-connector.core.ac.uk": "aggregator",
- "core.ac.uk": "aggregator",
- "static.aminer.org": "aggregator",
- "aminer.org": "aggregator",
- "pdfs.semanticscholar.org": "aggregator",
- "semanticscholar.org": "aggregator",
- "www.semanticscholar.org": "aggregator",
- "academic.oup.com": "publisher",
- "cdn.elifesciences.org": "publisher",
- "cell.com": "publisher",
- "dl.acm.org": "publisher",
- "downloads.hindawi.com": "publisher",
- "elifesciences.org": "publisher",
- "iopscience.iop.org": "publisher",
- "journals.plos.org": "publisher",
- "link.springer.com": "publisher",
- "onlinelibrary.wiley.com": "publisher",
- "works.bepress.com": "publisher",
- "www.biomedcentral.com": "publisher",
- "www.cell.com": "publisher",
- "www.nature.com": "publisher",
- "www.pnas.org": "publisher",
- "www.tandfonline.com": "publisher",
- "www.frontiersin.org": "publisher",
- "www.degruyter.com": "publisher",
- "www.mdpi.com": "publisher",
- "www.ahajournals.org": "publisher",
- "ehp.niehs.nih.gov": "publisher",
- "journals.tsu.ru": "publisher",
- "www.cogentoa.com": "publisher",
- "www.researchgate.net": "academicsocial",
- "academia.edu": "academicsocial",
- "wayback.archive-it.org": "webarchive",
- "web.archive.org": "webarchive",
- "archive.is": "webarchive",
-}
+MAX_ABSTRACT_LENGTH: int = 2048
def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 8f5a4265..52bd7465 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,7 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
from .common import EntityImporter
@@ -33,97 +34,6 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
"standard": "standard",
}
-CONTAINER_TYPE_MAP: Dict[str, str] = {
- "article-journal": "journal",
- "paper-conference": "conference",
- "book": "book-series",
-}
-
-# These are based, informally, on sorting the most popular licenses found in
-# Crossref metadata. There were over 500 unique strings and only a few most
-# popular are here; many were variants of the CC URLs. Would be useful to
-# normalize CC licenses better.
-# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//creativecommons.org/licenses/by/2.0/": "CC-BY",
- "//creativecommons.org/licenses/by/3.0/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/": "CC-BY",
- "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.karger.com/Services/SiteLicenses": "KARGER",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
- "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
- "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
- # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
- # //www.springer.com/tdm doesn't seem like a license
- # //iopscience.iop.org/page/copyright is closed
- # //www.acm.org/publications/policies/copyright_policy#Background is closed
- # //rsc.li/journals-terms-of-use is closed for vor (am open)
- # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-}
-
-
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
- if not raw:
- return None
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if "creativecommons.org" in raw.lower():
- raw = raw.lower()
- raw = raw.replace("/legalcode", "/").replace("/uk", "")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
-
-
-def test_lookup_license_slug() -> None:
-
- assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
- assert (
- lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
- == "CC-BY"
- )
- assert (
- lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
- == "CC-0"
- )
- assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
- assert (
- lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
- == "CC-BY-NC-SA"
- )
- assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
- assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
- assert lookup_license_slug("") is None
- assert lookup_license_slug(None) is None
-
class CrossrefImporter(EntityImporter):
"""
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 441514b8..b310f8bc 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,113 +21,19 @@ import langdetect
import pycountry
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
from fatcat_tools.transforms import entity_to_dict
-from .common import EntityImporter
-
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP: Dict[str, str] = {
+DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {
"Journal": "journal",
"Series": "journal",
"Book Series": "book-series",
}
-# The docs/guide should be the canonical home for these mappings; update there
-# first. Map various datacite type types to CSL-ish types. None means TODO or
-# remove.
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
- "ris": {
- "THES": "thesis",
- "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
- "CHAP": "chapter",
- "FIGURE": "figure",
- "RPRT": "report",
- "JOUR": "article-journal",
- "MPCT": "motion_picture",
- "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset
- "BOOK": "book",
- "DATA": "dataset",
- "COMP": "software",
- },
- "schemaOrg": {
- "Dataset": "dataset",
- "Book": "book",
- "ScholarlyArticle": "article-journal",
- "ImageObject": "graphic",
- "Collection": None,
- "MediaObject": None,
- "Event": None,
- "SoftwareSourceCode": "software",
- "Chapter": "chapter",
- "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
- "PublicationIssue": "article",
- "AudioObject": None,
- "Thesis": "thesis",
- },
- "citeproc": {
- "article": "article",
- "article-journal": "article-journal",
- "article-magazine": "article-magazine",
- "article-newspaper": "article-newspaper",
- "bill": "bill",
- "book": "book",
- "broadcast": "broadcast",
- "chapter": "chapter",
- "dataset": "dataset",
- "entry-dictionary": "entry-dictionary",
- "entry-encyclopedia": "entry-encyclopedia",
- "entry": "entry",
- "figure": "figure",
- "graphic": "graphic",
- "interview": "interview",
- "legal_case": "legal_case",
- "legislation": "legislation",
- "manuscript": "manuscript",
- "map": "map",
- "motion_picture": "motion_picture",
- "musical_score": "musical_score",
- "pamphlet": "pamphlet",
- "paper-conference": "paper-conference",
- "patent": "patent",
- "personal_communication": "personal_communication",
- "post": "post",
- "post-weblog": "post-weblog",
- "report": "report",
- "review-book": "review-book",
- "review": "review",
- "song": "song",
- "speech": "speech",
- "thesis": "thesis",
- "treaty": "treaty",
- "webpage": "webpage",
- }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
- "bibtex": {
- "phdthesis": "thesis",
- "inbook": "chapter",
- "misc": None,
- "article": "article-journal",
- "book": "book",
- },
- "resourceTypeGeneral": {
- "Image": "graphic",
- "Dataset": "dataset",
- "PhysicalObject": None,
- "Collection": None,
- "Text": None, # "Greyliterature, labnotes, accompanyingmaterials"
- "Sound": None,
- "InteractiveResource": None,
- "Event": None,
- "Software": "software",
- "Other": None,
- "Workflow": None,
- "Audiovisual": None,
- }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
-}
-
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
DATACITE_UNKNOWN_MARKERS: List[str] = [
"(:unac)", # temporarily inaccessible
@@ -180,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
}
]
-# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
- "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
- "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
- "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
- "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
- "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
- "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
- "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
- "//www.karger.com/Services/SiteLicenses/": "KARGER",
- "//www.springer.com/tdm/": "SPRINGER-TDM",
- "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-}
-
class DataciteImporter(EntityImporter):
"""
@@ -406,8 +275,8 @@ class DataciteImporter(EntityImporter):
container_name = None
container = attributes.get("container", {}) or {}
- if container.get("type") in CONTAINER_TYPE_MAP.keys():
- container_type = CONTAINER_TYPE_MAP.get(container["type"])
+ if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys():
+ container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])
if container.get("identifier") and container.get("identifierType") == "ISSN":
issn = container.get("identifier")
if issn and len(issn) == 8:
@@ -488,7 +357,7 @@ class DataciteImporter(EntityImporter):
license_extra = []
for lic in attributes.get("rightsList", []):
- slug = lookup_license_slug(lic.get("rightsUri"))
+ slug = datacite_lookup_license_slug(lic.get("rightsUri"))
if slug:
license_slug = slug
license_extra.append(lic)
@@ -968,7 +837,7 @@ def contributor_list_contains_contributor(
return False
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:
"""
Resolve a variety of strings into a some pseudo-canonical form, e.g.
CC-BY-ND, CC-0, MIT and so on.
@@ -1063,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
return None
return "RS-{}".format(name.upper())
- # Fallback to mapped values.
- raw = raw.lower()
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
+ # Fallback to generic license lookup
+ return lookup_license_slug(raw)
def find_original_language_title(
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index f5c886a2..92dbe574 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter
from fatcat_tools.normal import (
clean_doi,
clean_orcid,
@@ -24,9 +24,6 @@ from fatcat_tools.normal import (
parse_month,
)
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
-
class DoajArticleImporter(EntityImporter):
def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 9db499a0..3c85132c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -9,9 +9,7 @@ from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
from fatcat_tools.normal import clean_doi, clean_str
-from .common import EntityImporter, make_rel_url
-
-MAX_ABSTRACT_BYTES = 4096
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url
class GrobidMetadataImporter(EntityImporter):
@@ -84,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):
extra_grobid: Dict[str, Any] = dict()
abstract = obj.get("abstract")
- if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
+ if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
abobj = fatcat_openapi_client.ReleaseAbstract(
mimetype="text/plain", content=clean_str(obj.get("abstract"))
)
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index c2f650b0..79691c9a 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,7 +8,8 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC
+from fatcat_tools.normal import clean_doi, clean_str
from .common import EntityImporter
from .crossref import CONTAINER_TYPE_MAP
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3274234f..5bc7a9ff 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,325 +8,16 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import (
+from fatcat_tools.biblio_lookup_tables import (
+ COUNTRY_NAME_MAP,
LANG_MAP_MARC,
- clean_doi,
- clean_issn,
- clean_pmcid,
- clean_pmid,
- clean_str,
+ MONTH_ABBR_MAP,
+ PUBMED_RELEASE_TYPE_MAP,
)
+from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str
from .common import EntityImporter
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
- # Adaptive Clinical Trial
- "Address": "speech",
- "Autobiography": "book",
- # Bibliography
- "Biography": "book",
- # Case Reports
- "Classical Article": "article-journal",
- # Clinical Conference
- # Clinical Study
- # Clinical Trial
- # Clinical Trial, Phase I
- # Clinical Trial, Phase II
- # Clinical Trial, Phase III
- # Clinical Trial, Phase IV
- # Clinical Trial Protocol
- # Clinical Trial, Veterinary
- # Collected Works
- # Comparative Study
- # Congress
- # Consensus Development Conference
- # Consensus Development Conference, NIH
- # Controlled Clinical Trial
- "Dataset": "dataset",
- # Dictionary
- # Directory
- # Duplicate Publication
- "Editorial": "editorial",
- # English Abstract # doesn't indicate that this is abstract-only
- # Equivalence Trial
- # Evaluation Studies
- # Expression of Concern
- # Festschrift
- # Government Document
- # Guideline
- "Historical Article": "article-journal",
- # Interactive Tutorial
- "Interview": "interview",
- "Introductory Journal Article": "article-journal",
- "Journal Article": "article-journal",
- "Lecture": "speech",
- "Legal Case": "legal_case",
- "Legislation": "legislation",
- "Letter": "letter",
- # Meta-Analysis
- # Multicenter Study
- # News
- "Newspaper Article": "article-newspaper",
- # Observational Study
- # Observational Study, Veterinary
- # Overall
- # Patient Education Handout
- # Periodical Index
- # Personal Narrative
- # Portrait
- # Practice Guideline
- # Pragmatic Clinical Trial
- # Publication Components
- # Publication Formats
- # Publication Type Category
- # Randomized Controlled Trial
- # Research Support, American Recovery and Reinvestment Act
- # Research Support, N.I.H., Extramural
- # Research Support, N.I.H., Intramural
- # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
- # Research Support, U.S. Gov't, P.H.S.
- # Review # in the "literature review" sense, not "product review"
- # Scientific Integrity Review
- # Study Characteristics
- # Support of Research
- # Systematic Review
- "Technical Report": "report",
- # Twin Study
- # Validation Studies
- # Video-Audio Media
- # Webcasts
-}
-
-MONTH_ABBR_MAP = {
- "Jan": 1,
- "01": 1,
- "Feb": 2,
- "02": 2,
- "Mar": 3,
- "03": 3,
- "Apr": 4,
- "04": 4,
- "May": 5,
- "05": 5,
- "Jun": 6,
- "06": 6,
- "Jul": 7,
- "07": 7,
- "Aug": 8,
- "08": 8,
- "Sep": 9,
- "09": 9,
- "Oct": 10,
- "10": 10,
- "Nov": 11,
- "11": 11,
- "Dec": 12,
- "12": 12,
-}
-
-# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
-COUNTRY_NAME_MAP = {
- "Afghanistan": "af",
- "Albania": "al",
- "Algeria": "dz",
- "Andorra": "ad",
- "Angola": "ao",
- "Antigua and Barbuda": "ag",
- "Argentina": "ar",
- "Armenia": "am",
- "Australia": "au",
- "Austria": "at",
- "Azerbaijan": "az",
- "Bahamas": "bs",
- "Bahrain": "bh",
- "Bangladesh": "bd",
- "Barbados": "bb",
- "Belarus": "by",
- "Belgium": "be",
- "Belize": "bz",
- "Benin": "bj",
- "Bhutan": "bt",
- "Bolivia": "bo",
- "Bosnia and Herzegowina": "ba",
- "Botswana": "bw",
- "Brazil": "br",
- "Brunei Darussalam": "bn",
- "Bulgaria": "bg",
- "Burkina Faso": "bf",
- "Burundi": "bi",
- "Cambodia": "kh",
- "Cameroon": "cm",
- "Canada": "ca",
- "Cape Verde": "cv",
- "Central African Republic": "cf",
- "Chad": "td",
- "Chile": "cl",
- "China": "cn",
- "Colombia": "co",
- "Comoros": "km",
- "Congo, Democratic Republic": "cd",
- "Congo, People’s Republic": "cg",
- "Costa Rica": "cr",
- "Cote d'Ivoire": "ci",
- "Croatia (Local Name: Hrvatska)": "hr",
- "Cuba": "cu",
- "Cyprus": "cy",
- "Czech Republic": "cz",
- "Denmark": "dk",
- "Djibouti": "dj",
- "Dominica": "dm",
- "Dominican Republic": "do",
- "East Timor": "tl",
- "Ecuador": "ec",
- "El Salvador": "sv",
- "Equatorial Guinea": "gq",
- "Eritrea": "er",
- "Estonia": "ee",
- "Ethiopia": "et",
- "Fiji": "fj",
- "Finland": "fi",
- "France": "fr",
- "Gabon": "ga",
- "Gambia": "gm",
- "Georgia": "ge",
- "Germany": "de",
- "Ghana": "gh",
- "Greece": "gr",
- "Greenland": "gl",
- "Grenada": "gd",
- "Guatemala": "gt",
- "Guinea": "gn",
- "Guinea-Bissau": "gw",
- "Guyana": "gy",
- "Haiti": "ht",
- "Honduras": "hn",
- "Hong Kong": "hk",
- "Hungary": "hu",
- "Iceland": "is",
- "India": "in",
- "Indonesia": "id",
- "Iran": "ir",
- "Iraq": "iq",
- "Ireland": "ie",
- "Israel": "il",
- "Italy": "it",
- "Jamaica": "jm",
- "Japan": "jp",
- "Jordan": "jo",
- "Kazakhstan": "kz",
- "Kenya": "ke",
- "Kiribati": "ki",
- "Korea, Democratic People's Republic": "kp",
- "Korea, Republic": "kr",
- "Kuwait": "kw",
- "Kyrgyzstan": "kg",
- "Laos": "la",
- "Latvia": "lv",
- "Lebanon": "lb",
- "Lesotho": "ls",
- "Liberia": "lr",
- "Libya": "ly",
- "Liechtenstein": "li",
- "Lithuania": "lt",
- "Luxembourg": "lu",
- "Macedonia": "mk",
- "Madagascar": "mg",
- "Malawi": "mw",
- "Malaysia": "my",
- "Maldives": "mv",
- "Mali": "ml",
- "Malta": "mt",
- "Marshall Islands": "mh",
- "Mauritania": "mr",
- "Mauritius": "mu",
- "Mexico": "mx",
- "Micronesia": "fm",
- "Moldova": "md",
- "Monaco": "mc",
- "Mongolia": "mn",
- "Morocco": "ma",
- "Mozambique": "mz",
- "Myanmar": "mm",
- "Namibia": "na",
- "Nauru": "nr",
- "Nepal": "np",
- "Netherlands": "nl",
- "New Zealand": "nz",
- "Nicaragua": "ni",
- "Niger": "ne",
- "Nigeria": "ng",
- "Norway": "no",
- "Oman": "om",
- "Pakistan": "pk",
- "Palau": "pw",
- "Panama": "pa",
- "Papua New Guinea": "pg",
- "Paraguay": "py",
- "Peru": "pe",
- "Philippines": "ph",
- "Poland": "pl",
- "Portugal": "pt",
- "Puerto Rico": "pr",
- "Qatar": "qa",
- "Romania": "ro",
- "Russian Federation": "ru",
- "Rwanda": "rw",
- "Saint Kitts and Nevis": "kn",
- "Saint Lucia": "lc",
- "Saint Vincent and the Grenadines": "vc",
- "Samoa": "ws",
- "San Marino": "sm",
- "Sao Tome and Príncipe": "st",
- "Saudi Arabia": "sa",
- "Senegal": "sn",
- "Serbia and Montenegro": "cs",
- "Seychelles": "sc",
- "Sierra Leone": "sl",
- "Singapore": "sg",
- "Slovakia (Slovak Republic)": "sk",
- "Slovenia": "si",
- "Solomon Islands": "sb",
- "Somalia": "so",
- "South Africa": "za",
- "Spain": "es",
- "Sri Lanka": "lk",
- "Sudan": "sd",
- "Suriname": "sr",
- "Swaziland": "sz",
- "Sweden": "se",
- "Switzerland": "ch",
- "Syrian Arab Republic": "sy",
- "Taiwan": "tw",
- "Tajikistan": "tj",
- "Tanzania": "tz",
- "Tanzania": "tz",
- "Thailand": "th",
- "Togo": "tg",
- "Tonga": "to",
- "Trinidad and Tobago": "tt",
- "Tunisia": "tn",
- "Turkey": "tr",
- "Turkmenistan": "tm",
- "Tuvalu": "tv",
- "Uganda": "ug",
- "Ukraine": "ua",
- "United Arab Emirates": "ae",
- "United Kingdom": "gb",
- "United States": "us",
- "Uruguay": "uy",
- # Additions from running over large files
- "Bosnia and Herzegovina": "ba",
- # "International"
- "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
- "Russia (Federation)": "ru",
- "Scotland": "gb",
- "England": "gb",
- "Korea (South)": "kr",
- "Georgia (Republic)": "ge",
- "Egypt": "eg",
-}
-
class PubmedImporter(EntityImporter):
"""
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 0d2c84ce..fc80411c 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -12,6 +12,8 @@ import ftfy
import langdetect
import pycountry
+from .biblio_lookup_tables import LICENSE_SLUG_MAP
+
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
@@ -606,84 +608,35 @@ def test_parse_country_name() -> None:
assert parse_country_name("Japan") == "jp"
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
- "afr": "af",
- "alb": "sq",
- "amh": "am",
- "ara": "ar",
- "arm": "hy",
- "aze": "az",
- "ben": "bn",
- "bos": "bs",
- "bul": "bg",
- "cat": "ca",
- "chi": "zh",
- "cze": "cs",
- "dan": "da",
- "dut": "nl",
- "eng": "en",
- "epo": "eo",
- "est": "et",
- "fin": "fi",
- "fre": "fr",
- "geo": "ka",
- "ger": "de",
- "gla": "gd",
- "gre": "el",
- "heb": "he",
- "hin": "hi",
- "hrv": "hr",
- "hun": "hu",
- "ice": "is",
- "ind": "id",
- "ita": "it",
- "jpn": "ja",
- "kin": "rw",
- "kor": "ko",
- "lat": "la",
- "lav": "lv",
- "lit": "lt",
- "mac": "mk",
- "mal": "ml",
- "mao": "mi",
- "may": "ms",
- "nor": "no",
- "per": "fa",
- "per": "fa",
- "pol": "pl",
- "por": "pt",
- "pus": "ps",
- "rum": "ro",
- "rus": "ru",
- "san": "sa",
- "slo": "sk",
- "slv": "sl",
- "spa": "es",
- "srp": "sr",
- "swe": "sv",
- "tha": "th",
- "tur": "tr",
- "ukr": "uk",
- "urd": "ur",
- "vie": "vi",
- "wel": "cy",
- # additions
- "gle": "ga", # "Irish" (Gaelic)
- "jav": "jv", # Javanese
- "welsh": "cy", # Welsh
- "oci": "oc", # Occitan
- # Don't have ISO 639-1 codes
- "grc": "el", # Ancient Greek; map to modern greek
- "map": None, # Austronesian (collection)
- "syr": None, # Syriac, Modern
- "gem": None, # Old Saxon
- "non": None, # Old Norse
- "emg": None, # Eastern Meohang
- "neg": None, # Negidal
- "mul": None, # Multiple languages
- "und": None, # Undetermined
-}
+def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+ if not raw:
+ return None
+ raw = raw.strip().replace("http://", "//").replace("https://", "//")
+ if "creativecommons.org" in raw.lower():
+ raw = raw.lower()
+ raw = raw.replace("/legalcode", "/").replace("/uk", "")
+ if not raw.endswith("/"):
+ raw = raw + "/"
+ return LICENSE_SLUG_MAP.get(raw)
+
+
+def test_lookup_license_slug() -> None:
+
+ assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
+ assert (
+ lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
+ == "CC-BY"
+ )
+ assert (
+ lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
+ == "CC-0"
+ )
+ assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
+ assert (
+ lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
+ == "CC-BY-NC-SA"
+ )
+ assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
+ assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
+ assert lookup_license_slug("") is None
+ assert lookup_license_slug(None) is None