summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/pubmed.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/pubmed.py')
-rw-r--r--python/fatcat_tools/importers/pubmed.py324
1 files changed, 11 insertions, 313 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 1cdb450b..a6c7409d 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,317 +8,15 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid
-
-from .common import LANG_MAP_MARC, EntityImporter, clean
-
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
- # Adaptive Clinical Trial
- "Address": "speech",
- "Autobiography": "book",
- # Bibliography
- "Biography": "book",
- # Case Reports
- "Classical Article": "article-journal",
- # Clinical Conference
- # Clinical Study
- # Clinical Trial
- # Clinical Trial, Phase I
- # Clinical Trial, Phase II
- # Clinical Trial, Phase III
- # Clinical Trial, Phase IV
- # Clinical Trial Protocol
- # Clinical Trial, Veterinary
- # Collected Works
- # Comparative Study
- # Congress
- # Consensus Development Conference
- # Consensus Development Conference, NIH
- # Controlled Clinical Trial
- "Dataset": "dataset",
- # Dictionary
- # Directory
- # Duplicate Publication
- "Editorial": "editorial",
- # English Abstract # doesn't indicate that this is abstract-only
- # Equivalence Trial
- # Evaluation Studies
- # Expression of Concern
- # Festschrift
- # Government Document
- # Guideline
- "Historical Article": "article-journal",
- # Interactive Tutorial
- "Interview": "interview",
- "Introductory Journal Article": "article-journal",
- "Journal Article": "article-journal",
- "Lecture": "speech",
- "Legal Case": "legal_case",
- "Legislation": "legislation",
- "Letter": "letter",
- # Meta-Analysis
- # Multicenter Study
- # News
- "Newspaper Article": "article-newspaper",
- # Observational Study
- # Observational Study, Veterinary
- # Overall
- # Patient Education Handout
- # Periodical Index
- # Personal Narrative
- # Portrait
- # Practice Guideline
- # Pragmatic Clinical Trial
- # Publication Components
- # Publication Formats
- # Publication Type Category
- # Randomized Controlled Trial
- # Research Support, American Recovery and Reinvestment Act
- # Research Support, N.I.H., Extramural
- # Research Support, N.I.H., Intramural
- # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
- # Research Support, U.S. Gov't, P.H.S.
- # Review # in the "literature review" sense, not "product review"
- # Scientific Integrity Review
- # Study Characteristics
- # Support of Research
- # Systematic Review
- "Technical Report": "report",
- # Twin Study
- # Validation Studies
- # Video-Audio Media
- # Webcasts
-}
-
-MONTH_ABBR_MAP = {
- "Jan": 1,
- "01": 1,
- "Feb": 2,
- "02": 2,
- "Mar": 3,
- "03": 3,
- "Apr": 4,
- "04": 4,
- "May": 5,
- "05": 5,
- "Jun": 6,
- "06": 6,
- "Jul": 7,
- "07": 7,
- "Aug": 8,
- "08": 8,
- "Sep": 9,
- "09": 9,
- "Oct": 10,
- "10": 10,
- "Nov": 11,
- "11": 11,
- "Dec": 12,
- "12": 12,
-}
-
-# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
-COUNTRY_NAME_MAP = {
- "Afghanistan": "af",
- "Albania": "al",
- "Algeria": "dz",
- "Andorra": "ad",
- "Angola": "ao",
- "Antigua and Barbuda": "ag",
- "Argentina": "ar",
- "Armenia": "am",
- "Australia": "au",
- "Austria": "at",
- "Azerbaijan": "az",
- "Bahamas": "bs",
- "Bahrain": "bh",
- "Bangladesh": "bd",
- "Barbados": "bb",
- "Belarus": "by",
- "Belgium": "be",
- "Belize": "bz",
- "Benin": "bj",
- "Bhutan": "bt",
- "Bolivia": "bo",
- "Bosnia and Herzegowina": "ba",
- "Botswana": "bw",
- "Brazil": "br",
- "Brunei Darussalam": "bn",
- "Bulgaria": "bg",
- "Burkina Faso": "bf",
- "Burundi": "bi",
- "Cambodia": "kh",
- "Cameroon": "cm",
- "Canada": "ca",
- "Cape Verde": "cv",
- "Central African Republic": "cf",
- "Chad": "td",
- "Chile": "cl",
- "China": "cn",
- "Colombia": "co",
- "Comoros": "km",
- "Congo, Democratic Republic": "cd",
- "Congo, People’s Republic": "cg",
- "Costa Rica": "cr",
- "Cote d'Ivoire": "ci",
- "Croatia (Local Name: Hrvatska)": "hr",
- "Cuba": "cu",
- "Cyprus": "cy",
- "Czech Republic": "cz",
- "Denmark": "dk",
- "Djibouti": "dj",
- "Dominica": "dm",
- "Dominican Republic": "do",
- "East Timor": "tl",
- "Ecuador": "ec",
- "El Salvador": "sv",
- "Equatorial Guinea": "gq",
- "Eritrea": "er",
- "Estonia": "ee",
- "Ethiopia": "et",
- "Fiji": "fj",
- "Finland": "fi",
- "France": "fr",
- "Gabon": "ga",
- "Gambia": "gm",
- "Georgia": "ge",
- "Germany": "de",
- "Ghana": "gh",
- "Greece": "gr",
- "Greenland": "gl",
- "Grenada": "gd",
- "Guatemala": "gt",
- "Guinea": "gn",
- "Guinea-Bissau": "gw",
- "Guyana": "gy",
- "Haiti": "ht",
- "Honduras": "hn",
- "Hong Kong": "hk",
- "Hungary": "hu",
- "Iceland": "is",
- "India": "in",
- "Indonesia": "id",
- "Iran": "ir",
- "Iraq": "iq",
- "Ireland": "ie",
- "Israel": "il",
- "Italy": "it",
- "Jamaica": "jm",
- "Japan": "jp",
- "Jordan": "jo",
- "Kazakhstan": "kz",
- "Kenya": "ke",
- "Kiribati": "ki",
- "Korea, Democratic People's Republic": "kp",
- "Korea, Republic": "kr",
- "Kuwait": "kw",
- "Kyrgyzstan": "kg",
- "Laos": "la",
- "Latvia": "lv",
- "Lebanon": "lb",
- "Lesotho": "ls",
- "Liberia": "lr",
- "Libya": "ly",
- "Liechtenstein": "li",
- "Lithuania": "lt",
- "Luxembourg": "lu",
- "Macedonia": "mk",
- "Madagascar": "mg",
- "Malawi": "mw",
- "Malaysia": "my",
- "Maldives": "mv",
- "Mali": "ml",
- "Malta": "mt",
- "Marshall Islands": "mh",
- "Mauritania": "mr",
- "Mauritius": "mu",
- "Mexico": "mx",
- "Micronesia": "fm",
- "Moldova": "md",
- "Monaco": "mc",
- "Mongolia": "mn",
- "Morocco": "ma",
- "Mozambique": "mz",
- "Myanmar": "mm",
- "Namibia": "na",
- "Nauru": "nr",
- "Nepal": "np",
- "Netherlands": "nl",
- "New Zealand": "nz",
- "Nicaragua": "ni",
- "Niger": "ne",
- "Nigeria": "ng",
- "Norway": "no",
- "Oman": "om",
- "Pakistan": "pk",
- "Palau": "pw",
- "Panama": "pa",
- "Papua New Guinea": "pg",
- "Paraguay": "py",
- "Peru": "pe",
- "Philippines": "ph",
- "Poland": "pl",
- "Portugal": "pt",
- "Puerto Rico": "pr",
- "Qatar": "qa",
- "Romania": "ro",
- "Russian Federation": "ru",
- "Rwanda": "rw",
- "Saint Kitts and Nevis": "kn",
- "Saint Lucia": "lc",
- "Saint Vincent and the Grenadines": "vc",
- "Samoa": "ws",
- "San Marino": "sm",
- "Sao Tome and Príncipe": "st",
- "Saudi Arabia": "sa",
- "Senegal": "sn",
- "Serbia and Montenegro": "cs",
- "Seychelles": "sc",
- "Sierra Leone": "sl",
- "Singapore": "sg",
- "Slovakia (Slovak Republic)": "sk",
- "Slovenia": "si",
- "Solomon Islands": "sb",
- "Somalia": "so",
- "South Africa": "za",
- "Spain": "es",
- "Sri Lanka": "lk",
- "Sudan": "sd",
- "Suriname": "sr",
- "Swaziland": "sz",
- "Sweden": "se",
- "Switzerland": "ch",
- "Syrian Arab Republic": "sy",
- "Taiwan": "tw",
- "Tajikistan": "tj",
- "Tanzania": "tz",
- "Tanzania": "tz",
- "Thailand": "th",
- "Togo": "tg",
- "Tonga": "to",
- "Trinidad and Tobago": "tt",
- "Tunisia": "tn",
- "Turkey": "tr",
- "Turkmenistan": "tm",
- "Tuvalu": "tv",
- "Uganda": "ug",
- "Ukraine": "ua",
- "United Arab Emirates": "ae",
- "United Kingdom": "gb",
- "United States": "us",
- "Uruguay": "uy",
- # Additions from running over large files
- "Bosnia and Herzegovina": "ba",
- # "International"
- "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
- "Russia (Federation)": "ru",
- "Scotland": "gb",
- "England": "gb",
- "Korea (South)": "kr",
- "Georgia (Republic)": "ge",
- "Egypt": "eg",
-}
+from fatcat_tools.biblio_lookup_tables import (
+ COUNTRY_NAME_MAP,
+ LANG_MAP_MARC,
+ MONTH_ABBR_MAP,
+ PUBMED_RELEASE_TYPE_MAP,
+)
+from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str
+
+from .common import EntityImporter
class PubmedImporter(EntityImporter):
@@ -704,14 +402,14 @@ class PubmedImporter(EntityImporter):
if extra_pubmed:
extra["pubmed"] = extra_pubmed
- title = clean(title)
+ title = clean_str(title)
if not title:
return None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
title=title,
- original_title=clean(original_title),
+ original_title=clean_str(original_title),
release_type=release_type,
release_stage=release_stage,
release_date=release_date,