From ddc757bc1d5c610f42e9f5f10a4f060f517b66ca Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 10 Nov 2021 13:52:39 -0800 Subject: refactor importer metadata tables into separate file; move some helpers around - MAX_ABSTRACT_LENGTH set in a single place (importer common) - merge datacite license slug table in to common table, removing some TDM-specific licenses (which do not apply in the context of preserving the full work) --- python/fatcat_tools/importers/pubmed.py | 319 +------------------------------- 1 file changed, 5 insertions(+), 314 deletions(-) (limited to 'python/fatcat_tools/importers/pubmed.py') diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3274234f..5bc7a9ff 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -8,325 +8,16 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.normal import ( +from fatcat_tools.biblio_lookup_tables import ( + COUNTRY_NAME_MAP, LANG_MAP_MARC, - clean_doi, - clean_issn, - clean_pmcid, - clean_pmid, - clean_str, + MONTH_ABBR_MAP, + PUBMED_RELEASE_TYPE_MAP, ) +from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str from .common import EntityImporter -# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly -PUBMED_RELEASE_TYPE_MAP = { - # Adaptive Clinical Trial - "Address": "speech", - "Autobiography": "book", - # Bibliography - "Biography": "book", - # Case Reports - "Classical Article": "article-journal", - # Clinical Conference - # Clinical Study - # Clinical Trial - # Clinical Trial, Phase I - # Clinical Trial, Phase II - # Clinical Trial, Phase III - # Clinical Trial, Phase IV - # Clinical Trial Protocol - # Clinical Trial, Veterinary - # Collected Works - # Comparative Study - # Congress - # Consensus Development Conference - # Consensus Development Conference, NIH - # Controlled Clinical Trial - "Dataset": "dataset", - # Dictionary - # Directory - # Duplicate Publication - "Editorial": "editorial", - # English Abstract # doesn't indicate that this is abstract-only - # Equivalence Trial - # Evaluation Studies - # Expression of Concern - # Festschrift - # Government Document - # Guideline - "Historical Article": "article-journal", - # Interactive Tutorial - "Interview": "interview", - "Introductory Journal Article": "article-journal", - "Journal Article": "article-journal", - "Lecture": "speech", - "Legal Case": "legal_case", - "Legislation": "legislation", - "Letter": "letter", - # Meta-Analysis - # Multicenter Study - # News - "Newspaper Article": "article-newspaper", - # Observational Study - # Observational Study, Veterinary - # Overall - # Patient Education Handout - # Periodical Index - # Personal Narrative - # Portrait - # Practice Guideline - # Pragmatic Clinical Trial - # Publication Components - # Publication Formats - # Publication Type Category - # Randomized Controlled Trial - # Research Support, American Recovery and Reinvestment Act - # Research Support, N.I.H., Extramural - # Research Support, N.I.H., Intramural - # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - # Research Support, U.S. Gov't, P.H.S. - # Review # in the "literature review" sense, not "product review" - # Scientific Integrity Review - # Study Characteristics - # Support of Research - # Systematic Review - "Technical Report": "report", - # Twin Study - # Validation Studies - # Video-Audio Media - # Webcasts -} - -MONTH_ABBR_MAP = { - "Jan": 1, - "01": 1, - "Feb": 2, - "02": 2, - "Mar": 3, - "03": 3, - "Apr": 4, - "04": 4, - "May": 5, - "05": 5, - "Jun": 6, - "06": 6, - "Jul": 7, - "07": 7, - "Aug": 8, - "08": 8, - "Sep": 9, - "09": 9, - "Oct": 10, - "10": 10, - "Nov": 11, - "11": 11, - "Dec": 12, - "12": 12, -} - -# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ -COUNTRY_NAME_MAP = { - "Afghanistan": "af", - "Albania": "al", - "Algeria": "dz", - "Andorra": "ad", - "Angola": "ao", - "Antigua and Barbuda": "ag", - "Argentina": "ar", - "Armenia": "am", - "Australia": "au", - "Austria": "at", - "Azerbaijan": "az", - "Bahamas": "bs", - "Bahrain": "bh", - "Bangladesh": "bd", - "Barbados": "bb", - "Belarus": "by", - "Belgium": "be", - "Belize": "bz", - "Benin": "bj", - "Bhutan": "bt", - "Bolivia": "bo", - "Bosnia and Herzegowina": "ba", - "Botswana": "bw", - "Brazil": "br", - "Brunei Darussalam": "bn", - "Bulgaria": "bg", - "Burkina Faso": "bf", - "Burundi": "bi", - "Cambodia": "kh", - "Cameroon": "cm", - "Canada": "ca", - "Cape Verde": "cv", - "Central African Republic": "cf", - "Chad": "td", - "Chile": "cl", - "China": "cn", - "Colombia": "co", - "Comoros": "km", - "Congo, Democratic Republic": "cd", - "Congo, People’s Republic": "cg", - "Costa Rica": "cr", - "Cote d'Ivoire": "ci", - "Croatia (Local Name: Hrvatska)": "hr", - "Cuba": "cu", - "Cyprus": "cy", - "Czech Republic": "cz", - "Denmark": "dk", - "Djibouti": "dj", - "Dominica": "dm", - "Dominican Republic": "do", - "East Timor": "tl", - "Ecuador": "ec", - "El Salvador": "sv", - "Equatorial Guinea": "gq", - "Eritrea": "er", - "Estonia": "ee", - "Ethiopia": "et", - "Fiji": "fj", - "Finland": "fi", - "France": "fr", - "Gabon": "ga", - "Gambia": "gm", - "Georgia": "ge", - "Germany": "de", - "Ghana": "gh", - "Greece": "gr", - "Greenland": "gl", - "Grenada": "gd", - "Guatemala": "gt", - "Guinea": "gn", - "Guinea-Bissau": "gw", - "Guyana": "gy", - "Haiti": "ht", - "Honduras": "hn", - "Hong Kong": "hk", - "Hungary": "hu", - "Iceland": "is", - "India": "in", - "Indonesia": "id", - "Iran": "ir", - "Iraq": "iq", - "Ireland": "ie", - "Israel": "il", - "Italy": "it", - "Jamaica": "jm", - "Japan": "jp", - "Jordan": "jo", - "Kazakhstan": "kz", - "Kenya": "ke", - "Kiribati": "ki", - "Korea, Democratic People's Republic": "kp", - "Korea, Republic": "kr", - "Kuwait": "kw", - "Kyrgyzstan": "kg", - "Laos": "la", - "Latvia": "lv", - "Lebanon": "lb", - "Lesotho": "ls", - "Liberia": "lr", - "Libya": "ly", - "Liechtenstein": "li", - "Lithuania": "lt", - "Luxembourg": "lu", - "Macedonia": "mk", - "Madagascar": "mg", - "Malawi": "mw", - "Malaysia": "my", - "Maldives": "mv", - "Mali": "ml", - "Malta": "mt", - "Marshall Islands": "mh", - "Mauritania": "mr", - "Mauritius": "mu", - "Mexico": "mx", - "Micronesia": "fm", - "Moldova": "md", - "Monaco": "mc", - "Mongolia": "mn", - "Morocco": "ma", - "Mozambique": "mz", - "Myanmar": "mm", - "Namibia": "na", - "Nauru": "nr", - "Nepal": "np", - "Netherlands": "nl", - "New Zealand": "nz", - "Nicaragua": "ni", - "Niger": "ne", - "Nigeria": "ng", - "Norway": "no", - "Oman": "om", - "Pakistan": "pk", - "Palau": "pw", - "Panama": "pa", - "Papua New Guinea": "pg", - "Paraguay": "py", - "Peru": "pe", - "Philippines": "ph", - "Poland": "pl", - "Portugal": "pt", - "Puerto Rico": "pr", - "Qatar": "qa", - "Romania": "ro", - "Russian Federation": "ru", - "Rwanda": "rw", - "Saint Kitts and Nevis": "kn", - "Saint Lucia": "lc", - "Saint Vincent and the Grenadines": "vc", - "Samoa": "ws", - "San Marino": "sm", - "Sao Tome and Príncipe": "st", - "Saudi Arabia": "sa", - "Senegal": "sn", - "Serbia and Montenegro": "cs", - "Seychelles": "sc", - "Sierra Leone": "sl", - "Singapore": "sg", - "Slovakia (Slovak Republic)": "sk", - "Slovenia": "si", - "Solomon Islands": "sb", - "Somalia": "so", - "South Africa": "za", - "Spain": "es", - "Sri Lanka": "lk", - "Sudan": "sd", - "Suriname": "sr", - "Swaziland": "sz", - "Sweden": "se", - "Switzerland": "ch", - "Syrian Arab Republic": "sy", - "Taiwan": "tw", - "Tajikistan": "tj", - "Tanzania": "tz", - "Tanzania": "tz", - "Thailand": "th", - "Togo": "tg", - "Tonga": "to", - "Trinidad and Tobago": "tt", - "Tunisia": "tn", - "Turkey": "tr", - "Turkmenistan": "tm", - "Tuvalu": "tv", - "Uganda": "ug", - "Ukraine": "ua", - "United Arab Emirates": "ae", - "United Kingdom": "gb", - "United States": "us", - "Uruguay": "uy", - # Additions from running over large files - "Bosnia and Herzegovina": "ba", - # "International" - "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn - "Russia (Federation)": "ru", - "Scotland": "gb", - "England": "gb", - "Korea (South)": "kr", - "Georgia (Republic)": "ge", - "Egypt": "eg", -} - class PubmedImporter(EntityImporter): """ -- cgit v1.2.3