summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2021-11-11 01:12:18 +0000
committerbnewbold <bnewbold@archive.org>2021-11-11 01:12:18 +0000
commit6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers/common.py
parent7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
downloadfatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz
fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip
Merge branch 'bnewbold-import-refactors' into 'master'
Import refactors and deprecations. Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes. The Datacite-specific stuff could use review here. Remove unused/deprecated/dead code: - cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers - "extid map" sqlite3 feature from several importers, which was only used for initial bulk imports (and maybe should not have been used) Refactors: - moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all of them, just the ones that were either generic or very large (making the code hard to read) - shuffled around relative imports and some function names ("clean_str" vs. "clean") Some actual behavioral changes: - remove some Datacite-specific license slugs - stop trying to fix double-slashes in DOIs, which was causing more harm than good (some DOIs do actually have double-slashes!) - remove some excess metadata from datacite 'extra' fields
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r--python/fatcat_tools/importers/common.py69
1 files changed, 4 insertions, 65 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2ec6efda..e2157ee5 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,74 +27,14 @@ from fatcat_openapi_client import (
from fatcat_openapi_client.rest import ApiException
from fuzzycat.matching import match_release_fuzzy
-# TODO: refactor so remove need for this (re-imports for backwards compatibility)
-from fatcat_tools.normal import is_cjk # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex # noqa: F401
-from fatcat_tools.normal import clean_str as clean # noqa: F401
+from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP
+from fatcat_tools.normal import clean_doi
from fatcat_tools.transforms import entity_to_dict
DATE_FMT: str = "%Y-%m-%d"
SANE_MAX_RELEASES: int = 200
SANE_MAX_URLS: int = 100
-
-DOMAIN_REL_MAP: Dict[str, str] = {
- "archive.org": "archive",
- # LOCKSS, Portico, DuraSpace, etc would also be "archive"
- "arxiv.org": "repository",
- "babel.hathitrust.org": "repository",
- "cds.cern.ch": "repository",
- "deepblue.lib.umich.edu": "repository",
- "europepmc.org": "repository",
- "hal.inria.fr": "repository",
- "scielo.isciii.es": "repository",
- "www.dtic.mil": "repository",
- "www.jstage.jst.go.jp": "repository",
- "www.jstor.org": "repository",
- "www.ncbi.nlm.nih.gov": "repository",
- "ftp.ncbi.nlm.nih.gov": "repository",
- "www.scielo.br": "repository",
- "www.scielo.cl": "repository",
- "www.scielo.org.mx": "repository",
- "zenodo.org": "repository",
- "www.biorxiv.org": "repository",
- "www.medrxiv.org": "repository",
- "citeseerx.ist.psu.edu": "aggregator",
- "publisher-connector.core.ac.uk": "aggregator",
- "core.ac.uk": "aggregator",
- "static.aminer.org": "aggregator",
- "aminer.org": "aggregator",
- "pdfs.semanticscholar.org": "aggregator",
- "semanticscholar.org": "aggregator",
- "www.semanticscholar.org": "aggregator",
- "academic.oup.com": "publisher",
- "cdn.elifesciences.org": "publisher",
- "cell.com": "publisher",
- "dl.acm.org": "publisher",
- "downloads.hindawi.com": "publisher",
- "elifesciences.org": "publisher",
- "iopscience.iop.org": "publisher",
- "journals.plos.org": "publisher",
- "link.springer.com": "publisher",
- "onlinelibrary.wiley.com": "publisher",
- "works.bepress.com": "publisher",
- "www.biomedcentral.com": "publisher",
- "www.cell.com": "publisher",
- "www.nature.com": "publisher",
- "www.pnas.org": "publisher",
- "www.tandfonline.com": "publisher",
- "www.frontiersin.org": "publisher",
- "www.degruyter.com": "publisher",
- "www.mdpi.com": "publisher",
- "www.ahajournals.org": "publisher",
- "ehp.niehs.nih.gov": "publisher",
- "journals.tsu.ru": "publisher",
- "www.cogentoa.com": "publisher",
- "www.researchgate.net": "academicsocial",
- "academia.edu": "academicsocial",
- "wayback.archive-it.org": "webarchive",
- "web.archive.org": "webarchive",
- "archive.is": "webarchive",
-}
+MAX_ABSTRACT_LENGTH: int = 2048
def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
@@ -342,8 +282,7 @@ class EntityImporter:
return creator_id
def is_doi(self, doi: str) -> bool:
- # TODO: replace with clean_doi() from fatcat_tools.normal
- return doi.startswith("10.") and doi.count("/") >= 1
+ return clean_doi(doi) is not None
def lookup_doi(self, doi: str) -> Optional[str]:
"""Caches calls to the doi lookup API endpoint in a local dict