From ddc757bc1d5c610f42e9f5f10a4f060f517b66ca Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 10 Nov 2021 13:52:39 -0800 Subject: refactor importer metadata tables into separate file; move some helpers around - MAX_ABSTRACT_LENGTH set in a single place (importer common) - merge datacite license slug table in to common table, removing some TDM-specific licenses (which do not apply in the context of preserving the full work) --- python/fatcat_tools/importers/__init__.py | 3 +- python/fatcat_tools/importers/common.py | 61 +---- python/fatcat_tools/importers/crossref.py | 94 +------ python/fatcat_tools/importers/datacite.py | 155 +---------- python/fatcat_tools/importers/doaj_article.py | 5 +- python/fatcat_tools/importers/grobid_metadata.py | 6 +- python/fatcat_tools/importers/jstor.py | 3 +- python/fatcat_tools/importers/pubmed.py | 319 +---------------------- 8 files changed, 25 insertions(+), 621 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 4d4d696b..654be2e9 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -26,9 +26,8 @@ from .common import ( KafkaJsonPusher, LinePusher, SqlitePusher, - make_kafka_consumer, ) -from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug +from .crossref import CrossrefImporter from .datacite import DataciteImporter from .dblp_container import DblpContainerImporter from .dblp_release import DblpReleaseImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 56c3d32e..7c587395 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -27,71 +27,14 @@ from fatcat_openapi_client import ( from fatcat_openapi_client.rest import ApiException from fuzzycat.matching import match_release_fuzzy +from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP from fatcat_tools.normal import clean_doi from fatcat_tools.transforms import entity_to_dict DATE_FMT: str = "%Y-%m-%d" SANE_MAX_RELEASES: int = 200 SANE_MAX_URLS: int = 100 - -DOMAIN_REL_MAP: Dict[str, str] = { - "archive.org": "archive", - # LOCKSS, Portico, DuraSpace, etc would also be "archive" - "arxiv.org": "repository", - "babel.hathitrust.org": "repository", - "cds.cern.ch": "repository", - "deepblue.lib.umich.edu": "repository", - "europepmc.org": "repository", - "hal.inria.fr": "repository", - "scielo.isciii.es": "repository", - "www.dtic.mil": "repository", - "www.jstage.jst.go.jp": "repository", - "www.jstor.org": "repository", - "www.ncbi.nlm.nih.gov": "repository", - "ftp.ncbi.nlm.nih.gov": "repository", - "www.scielo.br": "repository", - "www.scielo.cl": "repository", - "www.scielo.org.mx": "repository", - "zenodo.org": "repository", - "www.biorxiv.org": "repository", - "www.medrxiv.org": "repository", - "citeseerx.ist.psu.edu": "aggregator", - "publisher-connector.core.ac.uk": "aggregator", - "core.ac.uk": "aggregator", - "static.aminer.org": "aggregator", - "aminer.org": "aggregator", - "pdfs.semanticscholar.org": "aggregator", - "semanticscholar.org": "aggregator", - "www.semanticscholar.org": "aggregator", - "academic.oup.com": "publisher", - "cdn.elifesciences.org": "publisher", - "cell.com": "publisher", - "dl.acm.org": "publisher", - "downloads.hindawi.com": "publisher", - "elifesciences.org": "publisher", - "iopscience.iop.org": "publisher", - "journals.plos.org": "publisher", - "link.springer.com": "publisher", - "onlinelibrary.wiley.com": "publisher", - "works.bepress.com": "publisher", - "www.biomedcentral.com": "publisher", - "www.cell.com": "publisher", - "www.nature.com": "publisher", - "www.pnas.org": "publisher", - "www.tandfonline.com": "publisher", - "www.frontiersin.org": "publisher", - "www.degruyter.com": "publisher", - "www.mdpi.com": "publisher", - "www.ahajournals.org": "publisher", - "ehp.niehs.nih.gov": "publisher", - "journals.tsu.ru": "publisher", - "www.cogentoa.com": "publisher", - "www.researchgate.net": "academicsocial", - "academia.edu": "academicsocial", - "wayback.archive-it.org": "webarchive", - "web.archive.org": "webarchive", - "archive.is": "webarchive", -} +MAX_ABSTRACT_LENGTH: int = 2048 def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]: diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 8f5a4265..52bd7465 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -4,7 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi, clean_str +from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP +from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug from .common import EntityImporter @@ -33,97 +34,6 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = { "standard": "standard", } -CONTAINER_TYPE_MAP: Dict[str, str] = { - "article-journal": "journal", - "paper-conference": "conference", - "book": "book-series", -} - -# These are based, informally, on sorting the most popular licenses found in -# Crossref metadata. There were over 500 unique strings and only a few most -# popular are here; many were variants of the CC URLs. Would be useful to -# normalize CC licenses better. -# The current norm is to only add license slugs that are at least partially OA. -LICENSE_SLUG_MAP: Dict[str, str] = { - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/publicdomain/zero/1.0/": "CC-0", - "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0", - "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", - "//creativecommons.org/licenses/by/2.0/": "CC-BY", - "//creativecommons.org/licenses/by/3.0/": "CC-BY", - "//creativecommons.org/licenses/by/4.0/": "CC-BY", - "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", - "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", - "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", - "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", - "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", - "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", - "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", - "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", - "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", - "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", - "//spdx.org/licenses/CC0-1.0.json": "CC-0", - "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", - "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", - "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", - "//spdx.org/licenses/MIT.json": "MIT", - "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", - "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.karger.com/Services/SiteLicenses": "KARGER", - "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE", - "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY", - "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", - "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", - "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA", - "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC", - # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license - # //www.springer.com/tdm doesn't seem like a license - # //iopscience.iop.org/page/copyright is closed - # //www.acm.org/publications/policies/copyright_policy#Background is closed - # //rsc.li/journals-terms-of-use is closed for vor (am open) - # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!) - "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", -} - - -def lookup_license_slug(raw: Optional[str]) -> Optional[str]: - if not raw: - return None - raw = raw.strip().replace("http://", "//").replace("https://", "//") - if "creativecommons.org" in raw.lower(): - raw = raw.lower() - raw = raw.replace("/legalcode", "/").replace("/uk", "") - if not raw.endswith("/"): - raw = raw + "/" - return LICENSE_SLUG_MAP.get(raw) - - -def test_lookup_license_slug() -> None: - - assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" - assert ( - lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") - == "CC-BY" - ) - assert ( - lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") - == "CC-0" - ) - assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" - assert ( - lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") - == "CC-BY-NC-SA" - ) - assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" - assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None - assert lookup_license_slug("") is None - assert lookup_license_slug(None) is None - class CrossrefImporter(EntityImporter): """ diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 441514b8..b310f8bc 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -21,113 +21,19 @@ import langdetect import pycountry from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi, clean_str +from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP +from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug from fatcat_tools.transforms import entity_to_dict -from .common import EntityImporter - -# Cutoff length for abstracts. -MAX_ABSTRACT_LENGTH = 2048 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary -CONTAINER_TYPE_MAP: Dict[str, str] = { +DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = { "Journal": "journal", "Series": "journal", "Book Series": "book-series", } -# The docs/guide should be the canonical home for these mappings; update there -# first. Map various datacite type types to CSL-ish types. None means TODO or -# remove. -DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = { - "ris": { - "THES": "thesis", - "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) - "CHAP": "chapter", - "FIGURE": "figure", - "RPRT": "report", - "JOUR": "article-journal", - "MPCT": "motion_picture", - "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset - "BOOK": "book", - "DATA": "dataset", - "COMP": "software", - }, - "schemaOrg": { - "Dataset": "dataset", - "Book": "book", - "ScholarlyArticle": "article-journal", - "ImageObject": "graphic", - "Collection": None, - "MediaObject": None, - "Event": None, - "SoftwareSourceCode": "software", - "Chapter": "chapter", - "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. - "PublicationIssue": "article", - "AudioObject": None, - "Thesis": "thesis", - }, - "citeproc": { - "article": "article", - "article-journal": "article-journal", - "article-magazine": "article-magazine", - "article-newspaper": "article-newspaper", - "bill": "bill", - "book": "book", - "broadcast": "broadcast", - "chapter": "chapter", - "dataset": "dataset", - "entry-dictionary": "entry-dictionary", - "entry-encyclopedia": "entry-encyclopedia", - "entry": "entry", - "figure": "figure", - "graphic": "graphic", - "interview": "interview", - "legal_case": "legal_case", - "legislation": "legislation", - "manuscript": "manuscript", - "map": "map", - "motion_picture": "motion_picture", - "musical_score": "musical_score", - "pamphlet": "pamphlet", - "paper-conference": "paper-conference", - "patent": "patent", - "personal_communication": "personal_communication", - "post": "post", - "post-weblog": "post-weblog", - "report": "report", - "review-book": "review-book", - "review": "review", - "song": "song", - "speech": "speech", - "thesis": "thesis", - "treaty": "treaty", - "webpage": "webpage", - }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types - "bibtex": { - "phdthesis": "thesis", - "inbook": "chapter", - "misc": None, - "article": "article-journal", - "book": "book", - }, - "resourceTypeGeneral": { - "Image": "graphic", - "Dataset": "dataset", - "PhysicalObject": None, - "Collection": None, - "Text": None, # "Greyliterature, labnotes, accompanyingmaterials" - "Sound": None, - "InteractiveResource": None, - "Event": None, - "Software": "software", - "Other": None, - "Workflow": None, - "Audiovisual": None, - }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 -} - # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. DATACITE_UNKNOWN_MARKERS: List[str] = [ "(:unac)", # temporarily inaccessible @@ -180,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [ } ] -# TODO(martin): merge this with other maps and lookup functions, eventually. -LICENSE_SLUG_MAP: Dict[str, str] = { - "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", - "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK", - "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", - "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1", - "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET", - "//onlinelibrary.wiley.com/termsandconditions/": "WILEY", - "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN", - "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY", - "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE", - "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC", - "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA", - "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", - "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3", - "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2", - "//www.karger.com/Services/SiteLicenses/": "KARGER", - "//www.springer.com/tdm/": "SPRINGER-TDM", - "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", - "//spdx.org/licenses/CC0-1.0.json": "CC-0", - "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", - "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", - "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", - "//spdx.org/licenses/MIT.json": "MIT", - "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", -} - class DataciteImporter(EntityImporter): """ @@ -406,8 +275,8 @@ class DataciteImporter(EntityImporter): container_name = None container = attributes.get("container", {}) or {} - if container.get("type") in CONTAINER_TYPE_MAP.keys(): - container_type = CONTAINER_TYPE_MAP.get(container["type"]) + if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys(): + container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"]) if container.get("identifier") and container.get("identifierType") == "ISSN": issn = container.get("identifier") if issn and len(issn) == 8: @@ -488,7 +357,7 @@ class DataciteImporter(EntityImporter): license_extra = [] for lic in attributes.get("rightsList", []): - slug = lookup_license_slug(lic.get("rightsUri")) + slug = datacite_lookup_license_slug(lic.get("rightsUri")) if slug: license_slug = slug license_extra.append(lic) @@ -968,7 +837,7 @@ def contributor_list_contains_contributor( return False -def lookup_license_slug(raw: Optional[str]) -> Optional[str]: +def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]: """ Resolve a variety of strings into a some pseudo-canonical form, e.g. CC-BY-ND, CC-0, MIT and so on. @@ -1063,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]: return None return "RS-{}".format(name.upper()) - # Fallback to mapped values. - raw = raw.lower() - raw = raw.strip().replace("http://", "//").replace("https://", "//") - if not raw.endswith("/"): - raw = raw + "/" - return LICENSE_SLUG_MAP.get(raw) + # Fallback to generic license lookup + return lookup_license_slug(raw) def find_original_language_title( diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index f5c886a2..92dbe574 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter from fatcat_tools.normal import ( clean_doi, clean_orcid, @@ -24,9 +24,6 @@ from fatcat_tools.normal import ( parse_month, ) -# Cutoff length for abstracts. -MAX_ABSTRACT_LENGTH = 2048 - class DoajArticleImporter(EntityImporter): def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 9db499a0..3c85132c 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -9,9 +9,7 @@ from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity from fatcat_tools.normal import clean_doi, clean_str -from .common import EntityImporter, make_rel_url - -MAX_ABSTRACT_BYTES = 4096 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url class GrobidMetadataImporter(EntityImporter): @@ -84,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter): extra_grobid: Dict[str, Any] = dict() abstract = obj.get("abstract") - if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: + if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10: abobj = fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", content=clean_str(obj.get("abstract")) ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index c2f650b0..79691c9a 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -8,7 +8,8 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str +from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC +from fatcat_tools.normal import clean_doi, clean_str from .common import EntityImporter from .crossref import CONTAINER_TYPE_MAP diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3274234f..5bc7a9ff 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -8,325 +8,16 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.normal import ( +from fatcat_tools.biblio_lookup_tables import ( + COUNTRY_NAME_MAP, LANG_MAP_MARC, - clean_doi, - clean_issn, - clean_pmcid, - clean_pmid, - clean_str, + MONTH_ABBR_MAP, + PUBMED_RELEASE_TYPE_MAP, ) +from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str from .common import EntityImporter -# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly -PUBMED_RELEASE_TYPE_MAP = { - # Adaptive Clinical Trial - "Address": "speech", - "Autobiography": "book", - # Bibliography - "Biography": "book", - # Case Reports - "Classical Article": "article-journal", - # Clinical Conference - # Clinical Study - # Clinical Trial - # Clinical Trial, Phase I - # Clinical Trial, Phase II - # Clinical Trial, Phase III - # Clinical Trial, Phase IV - # Clinical Trial Protocol - # Clinical Trial, Veterinary - # Collected Works - # Comparative Study - # Congress - # Consensus Development Conference - # Consensus Development Conference, NIH - # Controlled Clinical Trial - "Dataset": "dataset", - # Dictionary - # Directory - # Duplicate Publication - "Editorial": "editorial", - # English Abstract # doesn't indicate that this is abstract-only - # Equivalence Trial - # Evaluation Studies - # Expression of Concern - # Festschrift - # Government Document - # Guideline - "Historical Article": "article-journal", - # Interactive Tutorial - "Interview": "interview", - "Introductory Journal Article": "article-journal", - "Journal Article": "article-journal", - "Lecture": "speech", - "Legal Case": "legal_case", - "Legislation": "legislation", - "Letter": "letter", - # Meta-Analysis - # Multicenter Study - # News - "Newspaper Article": "article-newspaper", - # Observational Study - # Observational Study, Veterinary - # Overall - # Patient Education Handout - # Periodical Index - # Personal Narrative - # Portrait - # Practice Guideline - # Pragmatic Clinical Trial - # Publication Components - # Publication Formats - # Publication Type Category - # Randomized Controlled Trial - # Research Support, American Recovery and Reinvestment Act - # Research Support, N.I.H., Extramural - # Research Support, N.I.H., Intramural - # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - # Research Support, U.S. Gov't, P.H.S. - # Review # in the "literature review" sense, not "product review" - # Scientific Integrity Review - # Study Characteristics - # Support of Research - # Systematic Review - "Technical Report": "report", - # Twin Study - # Validation Studies - # Video-Audio Media - # Webcasts -} - -MONTH_ABBR_MAP = { - "Jan": 1, - "01": 1, - "Feb": 2, - "02": 2, - "Mar": 3, - "03": 3, - "Apr": 4, - "04": 4, - "May": 5, - "05": 5, - "Jun": 6, - "06": 6, - "Jul": 7, - "07": 7, - "Aug": 8, - "08": 8, - "Sep": 9, - "09": 9, - "Oct": 10, - "10": 10, - "Nov": 11, - "11": 11, - "Dec": 12, - "12": 12, -} - -# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ -COUNTRY_NAME_MAP = { - "Afghanistan": "af", - "Albania": "al", - "Algeria": "dz", - "Andorra": "ad", - "Angola": "ao", - "Antigua and Barbuda": "ag", - "Argentina": "ar", - "Armenia": "am", - "Australia": "au", - "Austria": "at", - "Azerbaijan": "az", - "Bahamas": "bs", - "Bahrain": "bh", - "Bangladesh": "bd", - "Barbados": "bb", - "Belarus": "by", - "Belgium": "be", - "Belize": "bz", - "Benin": "bj", - "Bhutan": "bt", - "Bolivia": "bo", - "Bosnia and Herzegowina": "ba", - "Botswana": "bw", - "Brazil": "br", - "Brunei Darussalam": "bn", - "Bulgaria": "bg", - "Burkina Faso": "bf", - "Burundi": "bi", - "Cambodia": "kh", - "Cameroon": "cm", - "Canada": "ca", - "Cape Verde": "cv", - "Central African Republic": "cf", - "Chad": "td", - "Chile": "cl", - "China": "cn", - "Colombia": "co", - "Comoros": "km", - "Congo, Democratic Republic": "cd", - "Congo, People’s Republic": "cg", - "Costa Rica": "cr", - "Cote d'Ivoire": "ci", - "Croatia (Local Name: Hrvatska)": "hr", - "Cuba": "cu", - "Cyprus": "cy", - "Czech Republic": "cz", - "Denmark": "dk", - "Djibouti": "dj", - "Dominica": "dm", - "Dominican Republic": "do", - "East Timor": "tl", - "Ecuador": "ec", - "El Salvador": "sv", - "Equatorial Guinea": "gq", - "Eritrea": "er", - "Estonia": "ee", - "Ethiopia": "et", - "Fiji": "fj", - "Finland": "fi", - "France": "fr", - "Gabon": "ga", - "Gambia": "gm", - "Georgia": "ge", - "Germany": "de", - "Ghana": "gh", - "Greece": "gr", - "Greenland": "gl", - "Grenada": "gd", - "Guatemala": "gt", - "Guinea": "gn", - "Guinea-Bissau": "gw", - "Guyana": "gy", - "Haiti": "ht", - "Honduras": "hn", - "Hong Kong": "hk", - "Hungary": "hu", - "Iceland": "is", - "India": "in", - "Indonesia": "id", - "Iran": "ir", - "Iraq": "iq", - "Ireland": "ie", - "Israel": "il", - "Italy": "it", - "Jamaica": "jm", - "Japan": "jp", - "Jordan": "jo", - "Kazakhstan": "kz", - "Kenya": "ke", - "Kiribati": "ki", - "Korea, Democratic People's Republic": "kp", - "Korea, Republic": "kr", - "Kuwait": "kw", - "Kyrgyzstan": "kg", - "Laos": "la", - "Latvia": "lv", - "Lebanon": "lb", - "Lesotho": "ls", - "Liberia": "lr", - "Libya": "ly", - "Liechtenstein": "li", - "Lithuania": "lt", - "Luxembourg": "lu", - "Macedonia": "mk", - "Madagascar": "mg", - "Malawi": "mw", - "Malaysia": "my", - "Maldives": "mv", - "Mali": "ml", - "Malta": "mt", - "Marshall Islands": "mh", - "Mauritania": "mr", - "Mauritius": "mu", - "Mexico": "mx", - "Micronesia": "fm", - "Moldova": "md", - "Monaco": "mc", - "Mongolia": "mn", - "Morocco": "ma", - "Mozambique": "mz", - "Myanmar": "mm", - "Namibia": "na", - "Nauru": "nr", - "Nepal": "np", - "Netherlands": "nl", - "New Zealand": "nz", - "Nicaragua": "ni", - "Niger": "ne", - "Nigeria": "ng", - "Norway": "no", - "Oman": "om", - "Pakistan": "pk", - "Palau": "pw", - "Panama": "pa", - "Papua New Guinea": "pg", - "Paraguay": "py", - "Peru": "pe", - "Philippines": "ph", - "Poland": "pl", - "Portugal": "pt", - "Puerto Rico": "pr", - "Qatar": "qa", - "Romania": "ro", - "Russian Federation": "ru", - "Rwanda": "rw", - "Saint Kitts and Nevis": "kn", - "Saint Lucia": "lc", - "Saint Vincent and the Grenadines": "vc", - "Samoa": "ws", - "San Marino": "sm", - "Sao Tome and Príncipe": "st", - "Saudi Arabia": "sa", - "Senegal": "sn", - "Serbia and Montenegro": "cs", - "Seychelles": "sc", - "Sierra Leone": "sl", - "Singapore": "sg", - "Slovakia (Slovak Republic)": "sk", - "Slovenia": "si", - "Solomon Islands": "sb", - "Somalia": "so", - "South Africa": "za", - "Spain": "es", - "Sri Lanka": "lk", - "Sudan": "sd", - "Suriname": "sr", - "Swaziland": "sz", - "Sweden": "se", - "Switzerland": "ch", - "Syrian Arab Republic": "sy", - "Taiwan": "tw", - "Tajikistan": "tj", - "Tanzania": "tz", - "Tanzania": "tz", - "Thailand": "th", - "Togo": "tg", - "Tonga": "to", - "Trinidad and Tobago": "tt", - "Tunisia": "tn", - "Turkey": "tr", - "Turkmenistan": "tm", - "Tuvalu": "tv", - "Uganda": "ug", - "Ukraine": "ua", - "United Arab Emirates": "ae", - "United Kingdom": "gb", - "United States": "us", - "Uruguay": "uy", - # Additions from running over large files - "Bosnia and Herzegovina": "ba", - # "International" - "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn - "Russia (Federation)": "ru", - "Scotland": "gb", - "England": "gb", - "Korea (South)": "kr", - "Georgia (Republic)": "ge", - "Egypt": "eg", -} - class PubmedImporter(EntityImporter): """ -- cgit v1.2.3