-rw-r--r--  python/fatcat_tools/importers/common.py            67
-rw-r--r--  python/fatcat_tools/importers/crossref.py           51
-rw-r--r--  python/fatcat_tools/normal.py                       19
-rw-r--r--  python/fatcat_tools/transforms/elasticsearch.py     12
4 files changed, 79 insertions(+), 70 deletions(-)
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 680b4f9c..2c4dd496 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,7 +7,7 @@ import sqlite3
import datetime
import subprocess
from collections import Counter
-from typing import Optional, Tuple
+from typing import Dict, Any, List, Optional
import lxml
import xml.etree.ElementTree as ET
@@ -26,11 +26,12 @@ import fuzzycat.verify
from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401
from fatcat_tools.transforms import entity_to_dict
-DATE_FMT = "%Y-%m-%d"
-SANE_MAX_RELEASES = 200
-SANE_MAX_URLS = 100
-DOMAIN_REL_MAP = {
+DATE_FMT: str = "%Y-%m-%d"
+SANE_MAX_RELEASES: int = 200
+SANE_MAX_URLS: int = 100
+
+DOMAIN_REL_MAP: Dict[str, str] = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -94,7 +95,7 @@ DOMAIN_REL_MAP = {
"archive.is": "webarchive",
}
-def make_rel_url(raw_url, default_link_rel="web"):
+def make_rel_url(raw_url: str, default_link_rel: str = "web"):
# this is where we map specific domains to rel types, and also filter out
# bad domains, invalid URLs, etc
rel = default_link_rel
@@ -153,33 +154,33 @@ class EntityImporter:
self.api = api
self.do_updates = bool(kwargs.get('do_updates', True))
- self.do_fuzzy_match = kwargs.get('do_fuzzy_match', True)
- self.bezerk_mode = kwargs.get('bezerk_mode', False)
- self.submit_mode = kwargs.get('submit_mode', False)
- self.edit_batch_size = kwargs.get('edit_batch_size', 100)
- self.editgroup_description = kwargs.get('editgroup_description')
- self.editgroup_extra = eg_extra
+ self.do_fuzzy_match: bool = kwargs.get('do_fuzzy_match', True)
+ self.bezerk_mode: bool = kwargs.get('bezerk_mode', False)
+ self.submit_mode: bool = kwargs.get('submit_mode', False)
+ self.edit_batch_size: int = kwargs.get('edit_batch_size', 100)
+ self.editgroup_description: Optional[str] = kwargs.get('editgroup_description')
+ self.editgroup_extra: Optional[Any] = eg_extra
self.es_client = kwargs.get('es_client')
if not self.es_client:
self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120)
- self._issnl_id_map = dict()
- self._orcid_id_map = dict()
+ self._issnl_id_map: Dict[str, Any] = dict()
+ self._orcid_id_map: Dict[str, Any] = dict()
self._orcid_regex = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
- self._doi_id_map = dict()
- self._pmid_id_map = dict()
+ self._doi_id_map: Dict[str, Any] = dict()
+ self._pmid_id_map: Dict[str, Any] = dict()
self.reset()
- def reset(self):
+ def reset(self) -> None:
self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
- self._edit_count = 0
- self._editgroup_id = None
- self._entity_queue = []
- self._edits_inflight = []
+ self._edit_count: int = 0
+ self._editgroup_id: Optional[str] = None
+ self._entity_queue: List[Any] = []
+ self._edits_inflight: List[Any] = []
- def push_record(self, raw_record):
+ def push_record(self, raw_record: Any) -> None:
"""
Returns nothing.
"""
@@ -198,7 +199,7 @@ class EntityImporter:
self.push_entity(entity)
return
- def parse_record(self, raw_record):
+ def parse_record(self, raw_record: Any) -> Optional[Any]:
"""
Returns an entity class type, or None if we should skip this one.
@@ -282,7 +283,7 @@ class EntityImporter:
self.counts['insert'] += len(self._entity_queue)
self._entity_queue = []
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
"""
Implementations can override for optional fast-path to drop a record.
Must have no side-effects; returns bool.
@@ -302,14 +303,14 @@ class EntityImporter:
"""
raise NotImplementedError
- def insert_batch(self, raw_record):
+ def insert_batch(self, raw_records: List[Any]):
raise NotImplementedError
- def is_orcid(self, orcid):
+ def is_orcid(self, orcid: str) -> bool:
# TODO: replace with clean_orcid() from fatcat_tools.normal
return self._orcid_regex.match(orcid) is not None
- def lookup_orcid(self, orcid):
+ def lookup_orcid(self, orcid: str):
"""Caches calls to the Orcid lookup API endpoint in a local dict"""
if not self.is_orcid(orcid):
return None
@@ -326,11 +327,11 @@ class EntityImporter:
self._orcid_id_map[orcid] = creator_id # might be None
return creator_id
- def is_doi(self, doi):
+ def is_doi(self, doi: str) -> bool:
# TODO: replace with clean_doi() from fatcat_tools.normal
return doi.startswith("10.") and doi.count("/") >= 1
- def lookup_doi(self, doi):
+ def lookup_doi(self, doi: str):
"""Caches calls to the doi lookup API endpoint in a local dict
For identifier lookups only (not full object fetches)"""
@@ -349,7 +350,7 @@ class EntityImporter:
self._doi_id_map[doi] = release_id # might be None
return release_id
- def lookup_pmid(self, pmid):
+ def lookup_pmid(self, pmid: str):
"""Caches calls to the pmid lookup API endpoint in a local dict
For identifier lookups only (not full object fetches)"""
@@ -366,10 +367,10 @@ class EntityImporter:
self._pmid_id_map[pmid] = release_id # might be None
return release_id
- def is_issnl(self, issnl):
+ def is_issnl(self, issnl: str) -> bool:
return len(issnl) == 9 and issnl[4] == '-'
- def lookup_issnl(self, issnl):
+ def lookup_issnl(self, issnl: str):
"""Caches calls to the ISSN-L lookup API endpoint in a local dict"""
if issnl in self._issnl_id_map:
return self._issnl_id_map[issnl]
@@ -396,7 +397,7 @@ class EntityImporter:
self._issn_issnl_map[issnl] = issnl
print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map)), file=sys.stderr)
- def issn2issnl(self, issn):
+ def issn2issnl(self, issn: str) -> Optional[str]:
if issn is None:
return None
return self._issn_issnl_map.get(issn)
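The hunks above annotate the EntityImporter lookup caches (`_doi_id_map: Dict[str, Any]`, etc.) and the `lookup_*` helpers that populate them. A minimal standalone sketch of that memoized-lookup pattern follows; the `api_client` object and its `release_lookup()` method are hypothetical stand-ins for illustration, not the fatcat API client itself.

# Sketch of the typed lookup-cache pattern used by EntityImporter.
# api_client and release_lookup() are hypothetical stand-ins.
from typing import Any, Dict, Optional

class DoiLookupCache:
    def __init__(self, api_client: Any) -> None:
        self.api = api_client
        self._doi_id_map: Dict[str, Optional[str]] = dict()

    def is_doi(self, doi: str) -> bool:
        # same cheap structural check as the importer's is_doi()
        return doi.startswith("10.") and doi.count("/") >= 1

    def lookup_doi(self, doi: str) -> Optional[str]:
        """Caches lookups in a local dict; returns None for unknown DOIs."""
        if not self.is_doi(doi):
            return None
        if doi in self._doi_id_map:
            return self._doi_id_map[doi]
        release_id = self.api.release_lookup(doi=doi)  # may return None
        self._doi_id_map[doi] = release_id  # cache negative results too
        return release_id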
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index e77fa65e..d4b4a4c7 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -9,7 +9,7 @@ from .common import EntityImporter, clean
# first
# Can get a list of Crossref types (with counts) via API:
# https://api.crossref.org/works?rows=0&facet=type-name:*
-CROSSREF_TYPE_MAP = {
+CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
'book': 'book',
'book-chapter': 'chapter',
'book-part': 'chapter',
@@ -30,7 +30,7 @@ CROSSREF_TYPE_MAP = {
'standard': 'standard',
}
-CONTAINER_TYPE_MAP = {
+CONTAINER_TYPE_MAP: Dict[str, str] = {
'article-journal': 'journal',
'paper-conference': 'conference',
'book': 'book-series',
@@ -41,7 +41,7 @@ CONTAINER_TYPE_MAP = {
# popular are here; many were variants of the CC URLs. Would be useful to
# normalize CC licenses better.
# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP = {
+LICENSE_SLUG_MAP: Dict[str, str] = {
"//creativecommons.org/publicdomain/mark/1.0": "CC-0",
"//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
"//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
@@ -87,7 +87,7 @@ LICENSE_SLUG_MAP = {
"//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
}
-def lookup_license_slug(raw):
+def lookup_license_slug(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip().replace('http://', '//').replace('https://', '//')
@@ -121,9 +121,9 @@ class CrossrefImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
+ eg_desc: Optional[str] = kwargs.get('editgroup_description',
"Automated import of Crossref DOI metadata, harvested from REST API")
- eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict())
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter')
super().__init__(api,
issn_map_file=issn_map_file,
@@ -131,9 +131,9 @@ class CrossrefImporter(EntityImporter):
editgroup_extra=eg_extra,
**kwargs)
- self.create_containers = kwargs.get('create_containers', True)
+ self.create_containers: bool = kwargs.get('create_containers', True)
extid_map_file = kwargs.get('extid_map_file')
- self.extid_map_db = None
+ self.extid_map_db: Optional[Any] = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
print("Using external ID map: {}".format(db_uri))
@@ -143,7 +143,7 @@ class CrossrefImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi):
+ def lookup_ext_ids(self, doi: str) -> Optional[Any]:
if self.extid_map_db is None:
return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
@@ -161,20 +161,23 @@ class CrossrefImporter(EntityImporter):
jstor_id=None,
)
- def map_release_type(self, crossref_type):
+ def map_release_type(self, crossref_type: str) -> Optional[str]:
return CROSSREF_TYPE_MAP.get(crossref_type)
- def map_container_type(self, crossref_type):
+ def map_container_type(self, crossref_type: Optional[str]) -> Optional[str]:
+ if not crossref_type:
+ return None
return CONTAINER_TYPE_MAP.get(crossref_type)
- def want(self, obj):
+ def want(self, obj: Dict[str, Any]) -> bool:
if not obj.get('title'):
self.counts['skip-blank-title'] += 1
return False
# these are pre-registered DOIs before the actual record is ready
# title is a list of titles
- if obj.get('title')[0].strip().lower() in [
+ titles = obj.get('title')
+ if titles is not None and titles[0].strip().lower() in [
"OUP accepted manuscript".lower(),
]:
self.counts['skip-stub-title'] += 1
@@ -183,7 +186,7 @@ class CrossrefImporter(EntityImporter):
# do most of these checks in-line below
return True
- def parse_record(self, obj):
+ def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
"""
obj is a python dict (parsed from json).
returns a ReleaseEntity
@@ -292,14 +295,15 @@ class CrossrefImporter(EntityImporter):
refs = []
for i, rm in enumerate(obj.get('reference', [])):
try:
- year = int(rm.get('year'))
+ year: Optional[int] = int(rm.get('year'))
# TODO: will need to update/config in the future!
# NOTE: are there crossref works with year < 100?
- if year > 2025 or year < 100:
- year = None
+ if year is not None:
+ if year > 2025 or year < 100:
+ year = None
except (TypeError, ValueError):
year = None
- ref_extra = dict()
+ ref_extra: Dict[str, Any] = dict()
key = rm.get('key')
if key and key.startswith(obj['DOI'].upper()):
key = key.replace(obj['DOI'].upper() + "-", '')
@@ -394,7 +398,7 @@ class CrossrefImporter(EntityImporter):
release_stage = None
# external identifiers
- extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
+ extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower())
# filter out unreasonably huge releases
if len(abstracts) > 100:
@@ -421,11 +425,14 @@ class CrossrefImporter(EntityImporter):
release_year = raw_date[0]
release_date = None
- original_title = None
+
+ original_title: Optional[str] = None
if obj.get('original-title'):
- original_title = clean(obj.get('original-title')[0], force_xml=True)
+ ot = obj.get('original-title')
+ if ot is not None:
+ original_title = clean(ot[0], force_xml=True)
- title = None
+ title: Optional[str] = None
if obj.get('title'):
title = clean(obj.get('title')[0], force_xml=True)
if not title or len(title) <= 1:
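The crossref.py hunks tighten the reference-year handling so the range check only runs on a successfully parsed value. A small sketch of that parse-then-bound pattern, using the same bounds (100, 2025) as the hunk; the helper name is hypothetical.

# Illustrative sketch of the reference-year sanity check added above.
from typing import Any, Optional

def parse_reference_year(raw: Any) -> Optional[int]:
    try:
        year = int(raw)
    except (TypeError, ValueError):
        return None
    # NOTE: same bounds as the Crossref importer hunk; would need updating
    if year > 2025 or year < 100:
        return None
    return year

assert parse_reference_year("1998") == 1998
assert parse_reference_year(None) is None
assert parse_reference_year("3025") is None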
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index e37cace8..e27754f5 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -15,7 +15,8 @@ import pycountry
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
-def clean_doi(raw):
+
+def clean_doi(raw: str) -> Optional[str]:
"""
Removes any:
- padding whitespace
@@ -91,7 +92,7 @@ def test_clean_doi():
ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
-def clean_arxiv_id(raw):
+def clean_arxiv_id(raw: str) -> Optional[str]:
"""
Removes any:
- 'arxiv:' prefix
@@ -162,7 +163,7 @@ def test_clean_wikidata_qid():
assert clean_wikidata_qid("qfba3") == None
assert clean_wikidata_qid("") == None
-def clean_pmid(raw):
+def clean_pmid(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
@@ -179,7 +180,7 @@ def test_clean_pmid():
assert clean_pmid("qfba3") == None
assert clean_pmid("") == None
-def clean_pmcid(raw):
+def clean_pmcid(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
@@ -189,7 +190,7 @@ def clean_pmcid(raw):
return raw
return None
-def clean_sha1(raw):
+def clean_sha1(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip().lower()
@@ -209,7 +210,7 @@ def test_clean_sha1():
assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") == None
assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") == None
-def clean_sha256(raw):
+def clean_sha256(raw: str) -> Optional[str]:
raw = raw.strip().lower()
if len(raw.split()) != 1:
return None
@@ -226,7 +227,7 @@ def test_clean_sha256():
ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$")
-def clean_issn(raw):
+def clean_issn(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip().upper()
@@ -244,7 +245,7 @@ def test_clean_issn():
ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$")
-def clean_isbn13(raw):
+def clean_isbn13(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
@@ -260,7 +261,7 @@ def test_clean_isbn13():
ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
-def clean_orcid(raw):
+def clean_orcid(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
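The normal.py hunks document the shared contract of the `clean_*()` helpers: take a raw string, return a normalized value or None. A self-contained copy of the ISSN case, mirroring the ISSN_REGEX shown above, with a short usage check; this standalone version is for illustration only.

# Sketch of the clean_*() -> Optional[str] contract, using the ISSN regex
# shown in the hunks above.
import re
from typing import Optional

ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$")

def clean_issn(raw: str) -> Optional[str]:
    if not raw:
        return None
    raw = raw.strip().upper()
    if ISSN_REGEX.match(raw):
        return raw
    return None

assert clean_issn(" 0024-9319 ") == "0024-9319"
assert clean_issn("0024-93199") is None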
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index f7c8e3f3..26eacded 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
import datetime
-from typing import Optional
+from typing import Dict, List, Any, Optional
import tldextract
@@ -24,7 +24,7 @@ def test_check_kbart() -> None:
assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True
-def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> dict:
+def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> Dict[str, Any]:
"""
Converts from an entity model/schema to elasticsearch oriented schema.
@@ -45,7 +45,7 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) ->
# First, the easy ones (direct copy)
release = entity
- t = dict(
+ t: Dict[str, Any] = dict(
doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z",
ident = release.ident,
state = release.state,
@@ -510,7 +510,7 @@ def container_to_elasticsearch(entity, force_bool=True, stats=None):
return t
-def _type_of_edit(edit):
+def _type_of_edit(edit: EntityEdit) -> str:
if edit.revision == None and edit.redirect_ident == None:
return 'delete'
elif edit.redirect_ident:
@@ -522,7 +522,7 @@ def _type_of_edit(edit):
return 'update'
-def changelog_to_elasticsearch(entity):
+def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:
"""
Note that this importer requires expanded fill info to work. Calling code
may need to re-fetch editgroup from API to get the 'editor' field. Some of
@@ -577,7 +577,7 @@ def changelog_to_elasticsearch(entity):
return t
-def file_to_elasticsearch(entity):
+def file_to_elasticsearch(entity: FileEntity) -> Dict[str, Any]:
"""
Converts from an entity model/schema to elasticsearch oriented schema.
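The elasticsearch.py hunks add a `Dict[str, Any]` return type to the transform functions and annotate the document dict `t`. A minimal sketch of that shape follows; the `SimpleRelease` stand-in and field selection are illustrative, not the fatcat schema or the real ReleaseEntity model.

# Minimal sketch of the typed entity-to-document transform signature.
import datetime
from dataclasses import dataclass
from typing import Any, Dict

@dataclass
class SimpleRelease:
    ident: str
    state: str
    title: str

def release_to_es_doc(release: SimpleRelease) -> Dict[str, Any]:
    # build the elasticsearch-oriented document, as in release_to_elasticsearch()
    t: Dict[str, Any] = dict(
        doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z",
        ident=release.ident,
        state=release.state,
        title=release.title,
    )
    return t

doc = release_to_es_doc(SimpleRelease(ident="abc123", state="active", title="Example"))
assert doc["ident"] == "abc123"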