summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/__init__.py3
-rw-r--r--python/fatcat_tools/importers/common.py61
-rw-r--r--python/fatcat_tools/importers/crossref.py94
-rw-r--r--python/fatcat_tools/importers/datacite.py155
-rw-r--r--python/fatcat_tools/importers/doaj_article.py5
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py6
-rw-r--r--python/fatcat_tools/importers/jstor.py3
-rw-r--r--python/fatcat_tools/importers/pubmed.py319
8 files changed, 25 insertions, 621 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 4d4d696b..654be2e9 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -26,9 +26,8 @@ from .common import (
KafkaJsonPusher,
LinePusher,
SqlitePusher,
- make_kafka_consumer,
)
-from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
+from .crossref import CrossrefImporter
from .datacite import DataciteImporter
from .dblp_container import DblpContainerImporter
from .dblp_release import DblpReleaseImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 56c3d32e..7c587395 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,71 +27,14 @@ from fatcat_openapi_client import (
from fatcat_openapi_client.rest import ApiException
from fuzzycat.matching import match_release_fuzzy
+from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP
from fatcat_tools.normal import clean_doi
from fatcat_tools.transforms import entity_to_dict
DATE_FMT: str = "%Y-%m-%d"
SANE_MAX_RELEASES: int = 200
SANE_MAX_URLS: int = 100
-
-DOMAIN_REL_MAP: Dict[str, str] = {
- "archive.org": "archive",
- # LOCKSS, Portico, DuraSpace, etc would also be "archive"
- "arxiv.org": "repository",
- "babel.hathitrust.org": "repository",
- "cds.cern.ch": "repository",
- "deepblue.lib.umich.edu": "repository",
- "europepmc.org": "repository",
- "hal.inria.fr": "repository",
- "scielo.isciii.es": "repository",
- "www.dtic.mil": "repository",
- "www.jstage.jst.go.jp": "repository",
- "www.jstor.org": "repository",
- "www.ncbi.nlm.nih.gov": "repository",
- "ftp.ncbi.nlm.nih.gov": "repository",
- "www.scielo.br": "repository",
- "www.scielo.cl": "repository",
- "www.scielo.org.mx": "repository",
- "zenodo.org": "repository",
- "www.biorxiv.org": "repository",
- "www.medrxiv.org": "repository",
- "citeseerx.ist.psu.edu": "aggregator",
- "publisher-connector.core.ac.uk": "aggregator",
- "core.ac.uk": "aggregator",
- "static.aminer.org": "aggregator",
- "aminer.org": "aggregator",
- "pdfs.semanticscholar.org": "aggregator",
- "semanticscholar.org": "aggregator",
- "www.semanticscholar.org": "aggregator",
- "academic.oup.com": "publisher",
- "cdn.elifesciences.org": "publisher",
- "cell.com": "publisher",
- "dl.acm.org": "publisher",
- "downloads.hindawi.com": "publisher",
- "elifesciences.org": "publisher",
- "iopscience.iop.org": "publisher",
- "journals.plos.org": "publisher",
- "link.springer.com": "publisher",
- "onlinelibrary.wiley.com": "publisher",
- "works.bepress.com": "publisher",
- "www.biomedcentral.com": "publisher",
- "www.cell.com": "publisher",
- "www.nature.com": "publisher",
- "www.pnas.org": "publisher",
- "www.tandfonline.com": "publisher",
- "www.frontiersin.org": "publisher",
- "www.degruyter.com": "publisher",
- "www.mdpi.com": "publisher",
- "www.ahajournals.org": "publisher",
- "ehp.niehs.nih.gov": "publisher",
- "journals.tsu.ru": "publisher",
- "www.cogentoa.com": "publisher",
- "www.researchgate.net": "academicsocial",
- "academia.edu": "academicsocial",
- "wayback.archive-it.org": "webarchive",
- "web.archive.org": "webarchive",
- "archive.is": "webarchive",
-}
+MAX_ABSTRACT_LENGTH: int = 2048
def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 8f5a4265..52bd7465 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,7 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
from .common import EntityImporter
@@ -33,97 +34,6 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
"standard": "standard",
}
-CONTAINER_TYPE_MAP: Dict[str, str] = {
- "article-journal": "journal",
- "paper-conference": "conference",
- "book": "book-series",
-}
-
-# These are based, informally, on sorting the most popular licenses found in
-# Crossref metadata. There were over 500 unique strings and only a few most
-# popular are here; many were variants of the CC URLs. Would be useful to
-# normalize CC licenses better.
-# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//creativecommons.org/licenses/by/2.0/": "CC-BY",
- "//creativecommons.org/licenses/by/3.0/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/": "CC-BY",
- "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.karger.com/Services/SiteLicenses": "KARGER",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
- "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
- "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
- # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
- # //www.springer.com/tdm doesn't seem like a license
- # //iopscience.iop.org/page/copyright is closed
- # //www.acm.org/publications/policies/copyright_policy#Background is closed
- # //rsc.li/journals-terms-of-use is closed for vor (am open)
- # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-}
-
-
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
- if not raw:
- return None
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if "creativecommons.org" in raw.lower():
- raw = raw.lower()
- raw = raw.replace("/legalcode", "/").replace("/uk", "")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
-
-
-def test_lookup_license_slug() -> None:
-
- assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
- assert (
- lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
- == "CC-BY"
- )
- assert (
- lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
- == "CC-0"
- )
- assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
- assert (
- lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
- == "CC-BY-NC-SA"
- )
- assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
- assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
- assert lookup_license_slug("") is None
- assert lookup_license_slug(None) is None
-
class CrossrefImporter(EntityImporter):
"""
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 441514b8..b310f8bc 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,113 +21,19 @@ import langdetect
import pycountry
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
from fatcat_tools.transforms import entity_to_dict
-from .common import EntityImporter
-
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP: Dict[str, str] = {
+DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {
"Journal": "journal",
"Series": "journal",
"Book Series": "book-series",
}
-# The docs/guide should be the canonical home for these mappings; update there
-# first. Map various datacite type types to CSL-ish types. None means TODO or
-# remove.
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
- "ris": {
- "THES": "thesis",
- "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
- "CHAP": "chapter",
- "FIGURE": "figure",
- "RPRT": "report",
- "JOUR": "article-journal",
- "MPCT": "motion_picture",
- "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset
- "BOOK": "book",
- "DATA": "dataset",
- "COMP": "software",
- },
- "schemaOrg": {
- "Dataset": "dataset",
- "Book": "book",
- "ScholarlyArticle": "article-journal",
- "ImageObject": "graphic",
- "Collection": None,
- "MediaObject": None,
- "Event": None,
- "SoftwareSourceCode": "software",
- "Chapter": "chapter",
- "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
- "PublicationIssue": "article",
- "AudioObject": None,
- "Thesis": "thesis",
- },
- "citeproc": {
- "article": "article",
- "article-journal": "article-journal",
- "article-magazine": "article-magazine",
- "article-newspaper": "article-newspaper",
- "bill": "bill",
- "book": "book",
- "broadcast": "broadcast",
- "chapter": "chapter",
- "dataset": "dataset",
- "entry-dictionary": "entry-dictionary",
- "entry-encyclopedia": "entry-encyclopedia",
- "entry": "entry",
- "figure": "figure",
- "graphic": "graphic",
- "interview": "interview",
- "legal_case": "legal_case",
- "legislation": "legislation",
- "manuscript": "manuscript",
- "map": "map",
- "motion_picture": "motion_picture",
- "musical_score": "musical_score",
- "pamphlet": "pamphlet",
- "paper-conference": "paper-conference",
- "patent": "patent",
- "personal_communication": "personal_communication",
- "post": "post",
- "post-weblog": "post-weblog",
- "report": "report",
- "review-book": "review-book",
- "review": "review",
- "song": "song",
- "speech": "speech",
- "thesis": "thesis",
- "treaty": "treaty",
- "webpage": "webpage",
- }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
- "bibtex": {
- "phdthesis": "thesis",
- "inbook": "chapter",
- "misc": None,
- "article": "article-journal",
- "book": "book",
- },
- "resourceTypeGeneral": {
- "Image": "graphic",
- "Dataset": "dataset",
- "PhysicalObject": None,
- "Collection": None,
- "Text": None, # "Greyliterature, labnotes, accompanyingmaterials"
- "Sound": None,
- "InteractiveResource": None,
- "Event": None,
- "Software": "software",
- "Other": None,
- "Workflow": None,
- "Audiovisual": None,
- }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
-}
-
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
DATACITE_UNKNOWN_MARKERS: List[str] = [
"(:unac)", # temporarily inaccessible
@@ -180,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
}
]
-# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
- "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
- "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
- "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
- "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
- "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
- "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
- "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
- "//www.karger.com/Services/SiteLicenses/": "KARGER",
- "//www.springer.com/tdm/": "SPRINGER-TDM",
- "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-}
-
class DataciteImporter(EntityImporter):
"""
@@ -406,8 +275,8 @@ class DataciteImporter(EntityImporter):
container_name = None
container = attributes.get("container", {}) or {}
- if container.get("type") in CONTAINER_TYPE_MAP.keys():
- container_type = CONTAINER_TYPE_MAP.get(container["type"])
+ if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys():
+ container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])
if container.get("identifier") and container.get("identifierType") == "ISSN":
issn = container.get("identifier")
if issn and len(issn) == 8:
@@ -488,7 +357,7 @@ class DataciteImporter(EntityImporter):
license_extra = []
for lic in attributes.get("rightsList", []):
- slug = lookup_license_slug(lic.get("rightsUri"))
+ slug = datacite_lookup_license_slug(lic.get("rightsUri"))
if slug:
license_slug = slug
license_extra.append(lic)
@@ -968,7 +837,7 @@ def contributor_list_contains_contributor(
return False
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:
"""
Resolve a variety of strings into a some pseudo-canonical form, e.g.
CC-BY-ND, CC-0, MIT and so on.
@@ -1063,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
return None
return "RS-{}".format(name.upper())
- # Fallback to mapped values.
- raw = raw.lower()
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
+ # Fallback to generic license lookup
+ return lookup_license_slug(raw)
def find_original_language_title(
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index f5c886a2..92dbe574 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter
from fatcat_tools.normal import (
clean_doi,
clean_orcid,
@@ -24,9 +24,6 @@ from fatcat_tools.normal import (
parse_month,
)
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
-
class DoajArticleImporter(EntityImporter):
def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 9db499a0..3c85132c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -9,9 +9,7 @@ from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
from fatcat_tools.normal import clean_doi, clean_str
-from .common import EntityImporter, make_rel_url
-
-MAX_ABSTRACT_BYTES = 4096
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url
class GrobidMetadataImporter(EntityImporter):
@@ -84,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):
extra_grobid: Dict[str, Any] = dict()
abstract = obj.get("abstract")
- if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
+ if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
abobj = fatcat_openapi_client.ReleaseAbstract(
mimetype="text/plain", content=clean_str(obj.get("abstract"))
)
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index c2f650b0..79691c9a 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,7 +8,8 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC
+from fatcat_tools.normal import clean_doi, clean_str
from .common import EntityImporter
from .crossref import CONTAINER_TYPE_MAP
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3274234f..5bc7a9ff 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,325 +8,16 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import (
+from fatcat_tools.biblio_lookup_tables import (
+ COUNTRY_NAME_MAP,
LANG_MAP_MARC,
- clean_doi,
- clean_issn,
- clean_pmcid,
- clean_pmid,
- clean_str,
+ MONTH_ABBR_MAP,
+ PUBMED_RELEASE_TYPE_MAP,
)
+from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str
from .common import EntityImporter
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
- # Adaptive Clinical Trial
- "Address": "speech",
- "Autobiography": "book",
- # Bibliography
- "Biography": "book",
- # Case Reports
- "Classical Article": "article-journal",
- # Clinical Conference
- # Clinical Study
- # Clinical Trial
- # Clinical Trial, Phase I
- # Clinical Trial, Phase II
- # Clinical Trial, Phase III
- # Clinical Trial, Phase IV
- # Clinical Trial Protocol
- # Clinical Trial, Veterinary
- # Collected Works
- # Comparative Study
- # Congress
- # Consensus Development Conference
- # Consensus Development Conference, NIH
- # Controlled Clinical Trial
- "Dataset": "dataset",
- # Dictionary
- # Directory
- # Duplicate Publication
- "Editorial": "editorial",
- # English Abstract # doesn't indicate that this is abstract-only
- # Equivalence Trial
- # Evaluation Studies
- # Expression of Concern
- # Festschrift
- # Government Document
- # Guideline
- "Historical Article": "article-journal",
- # Interactive Tutorial
- "Interview": "interview",
- "Introductory Journal Article": "article-journal",
- "Journal Article": "article-journal",
- "Lecture": "speech",
- "Legal Case": "legal_case",
- "Legislation": "legislation",
- "Letter": "letter",
- # Meta-Analysis
- # Multicenter Study
- # News
- "Newspaper Article": "article-newspaper",
- # Observational Study
- # Observational Study, Veterinary
- # Overall
- # Patient Education Handout
- # Periodical Index
- # Personal Narrative
- # Portrait
- # Practice Guideline
- # Pragmatic Clinical Trial
- # Publication Components
- # Publication Formats
- # Publication Type Category
- # Randomized Controlled Trial
- # Research Support, American Recovery and Reinvestment Act
- # Research Support, N.I.H., Extramural
- # Research Support, N.I.H., Intramural
- # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
- # Research Support, U.S. Gov't, P.H.S.
- # Review # in the "literature review" sense, not "product review"
- # Scientific Integrity Review
- # Study Characteristics
- # Support of Research
- # Systematic Review
- "Technical Report": "report",
- # Twin Study
- # Validation Studies
- # Video-Audio Media
- # Webcasts
-}
-
-MONTH_ABBR_MAP = {
- "Jan": 1,
- "01": 1,
- "Feb": 2,
- "02": 2,
- "Mar": 3,
- "03": 3,
- "Apr": 4,
- "04": 4,
- "May": 5,
- "05": 5,
- "Jun": 6,
- "06": 6,
- "Jul": 7,
- "07": 7,
- "Aug": 8,
- "08": 8,
- "Sep": 9,
- "09": 9,
- "Oct": 10,
- "10": 10,
- "Nov": 11,
- "11": 11,
- "Dec": 12,
- "12": 12,
-}
-
-# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
-COUNTRY_NAME_MAP = {
- "Afghanistan": "af",
- "Albania": "al",
- "Algeria": "dz",
- "Andorra": "ad",
- "Angola": "ao",
- "Antigua and Barbuda": "ag",
- "Argentina": "ar",
- "Armenia": "am",
- "Australia": "au",
- "Austria": "at",
- "Azerbaijan": "az",
- "Bahamas": "bs",
- "Bahrain": "bh",
- "Bangladesh": "bd",
- "Barbados": "bb",
- "Belarus": "by",
- "Belgium": "be",
- "Belize": "bz",
- "Benin": "bj",
- "Bhutan": "bt",
- "Bolivia": "bo",
- "Bosnia and Herzegowina": "ba",
- "Botswana": "bw",
- "Brazil": "br",
- "Brunei Darussalam": "bn",
- "Bulgaria": "bg",
- "Burkina Faso": "bf",
- "Burundi": "bi",
- "Cambodia": "kh",
- "Cameroon": "cm",
- "Canada": "ca",
- "Cape Verde": "cv",
- "Central African Republic": "cf",
- "Chad": "td",
- "Chile": "cl",
- "China": "cn",
- "Colombia": "co",
- "Comoros": "km",
- "Congo, Democratic Republic": "cd",
- "Congo, People’s Republic": "cg",
- "Costa Rica": "cr",
- "Cote d'Ivoire": "ci",
- "Croatia (Local Name: Hrvatska)": "hr",
- "Cuba": "cu",
- "Cyprus": "cy",
- "Czech Republic": "cz",
- "Denmark": "dk",
- "Djibouti": "dj",
- "Dominica": "dm",
- "Dominican Republic": "do",
- "East Timor": "tl",
- "Ecuador": "ec",
- "El Salvador": "sv",
- "Equatorial Guinea": "gq",
- "Eritrea": "er",
- "Estonia": "ee",
- "Ethiopia": "et",
- "Fiji": "fj",
- "Finland": "fi",
- "France": "fr",
- "Gabon": "ga",
- "Gambia": "gm",
- "Georgia": "ge",
- "Germany": "de",
- "Ghana": "gh",
- "Greece": "gr",
- "Greenland": "gl",
- "Grenada": "gd",
- "Guatemala": "gt",
- "Guinea": "gn",
- "Guinea-Bissau": "gw",
- "Guyana": "gy",
- "Haiti": "ht",
- "Honduras": "hn",
- "Hong Kong": "hk",
- "Hungary": "hu",
- "Iceland": "is",
- "India": "in",
- "Indonesia": "id",
- "Iran": "ir",
- "Iraq": "iq",
- "Ireland": "ie",
- "Israel": "il",
- "Italy": "it",
- "Jamaica": "jm",
- "Japan": "jp",
- "Jordan": "jo",
- "Kazakhstan": "kz",
- "Kenya": "ke",
- "Kiribati": "ki",
- "Korea, Democratic People's Republic": "kp",
- "Korea, Republic": "kr",
- "Kuwait": "kw",
- "Kyrgyzstan": "kg",
- "Laos": "la",
- "Latvia": "lv",
- "Lebanon": "lb",
- "Lesotho": "ls",
- "Liberia": "lr",
- "Libya": "ly",
- "Liechtenstein": "li",
- "Lithuania": "lt",
- "Luxembourg": "lu",
- "Macedonia": "mk",
- "Madagascar": "mg",
- "Malawi": "mw",
- "Malaysia": "my",
- "Maldives": "mv",
- "Mali": "ml",
- "Malta": "mt",
- "Marshall Islands": "mh",
- "Mauritania": "mr",
- "Mauritius": "mu",
- "Mexico": "mx",
- "Micronesia": "fm",
- "Moldova": "md",
- "Monaco": "mc",
- "Mongolia": "mn",
- "Morocco": "ma",
- "Mozambique": "mz",
- "Myanmar": "mm",
- "Namibia": "na",
- "Nauru": "nr",
- "Nepal": "np",
- "Netherlands": "nl",
- "New Zealand": "nz",
- "Nicaragua": "ni",
- "Niger": "ne",
- "Nigeria": "ng",
- "Norway": "no",
- "Oman": "om",
- "Pakistan": "pk",
- "Palau": "pw",
- "Panama": "pa",
- "Papua New Guinea": "pg",
- "Paraguay": "py",
- "Peru": "pe",
- "Philippines": "ph",
- "Poland": "pl",
- "Portugal": "pt",
- "Puerto Rico": "pr",
- "Qatar": "qa",
- "Romania": "ro",
- "Russian Federation": "ru",
- "Rwanda": "rw",
- "Saint Kitts and Nevis": "kn",
- "Saint Lucia": "lc",
- "Saint Vincent and the Grenadines": "vc",
- "Samoa": "ws",
- "San Marino": "sm",
- "Sao Tome and Príncipe": "st",
- "Saudi Arabia": "sa",
- "Senegal": "sn",
- "Serbia and Montenegro": "cs",
- "Seychelles": "sc",
- "Sierra Leone": "sl",
- "Singapore": "sg",
- "Slovakia (Slovak Republic)": "sk",
- "Slovenia": "si",
- "Solomon Islands": "sb",
- "Somalia": "so",
- "South Africa": "za",
- "Spain": "es",
- "Sri Lanka": "lk",
- "Sudan": "sd",
- "Suriname": "sr",
- "Swaziland": "sz",
- "Sweden": "se",
- "Switzerland": "ch",
- "Syrian Arab Republic": "sy",
- "Taiwan": "tw",
- "Tajikistan": "tj",
- "Tanzania": "tz",
- "Tanzania": "tz",
- "Thailand": "th",
- "Togo": "tg",
- "Tonga": "to",
- "Trinidad and Tobago": "tt",
- "Tunisia": "tn",
- "Turkey": "tr",
- "Turkmenistan": "tm",
- "Tuvalu": "tv",
- "Uganda": "ug",
- "Ukraine": "ua",
- "United Arab Emirates": "ae",
- "United Kingdom": "gb",
- "United States": "us",
- "Uruguay": "uy",
- # Additions from running over large files
- "Bosnia and Herzegovina": "ba",
- # "International"
- "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
- "Russia (Federation)": "ru",
- "Scotland": "gb",
- "England": "gb",
- "Korea (South)": "kr",
- "Georgia (Republic)": "ge",
- "Egypt": "eg",
-}
-
class PubmedImporter(EntityImporter):
"""