Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/arabesque.py           15
-rw-r--r--  python/fatcat_tools/importers/arxiv.py               28
-rwxr-xr-x  python/fatcat_tools/importers/cdl_dash_dat.py        20
-rw-r--r--  python/fatcat_tools/importers/chocula.py             15
-rw-r--r--  python/fatcat_tools/importers/common.py             146
-rw-r--r--  python/fatcat_tools/importers/crossref.py            26
-rw-r--r--  python/fatcat_tools/importers/datacite.py            89
-rw-r--r--  python/fatcat_tools/importers/dblp_container.py      25
-rw-r--r--  python/fatcat_tools/importers/dblp_release.py        24
-rw-r--r--  python/fatcat_tools/importers/doaj_article.py        21
-rw-r--r--  python/fatcat_tools/importers/file_meta.py           11
-rw-r--r--  python/fatcat_tools/importers/fileset_generic.py     13
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py     26
-rw-r--r--  python/fatcat_tools/importers/ingest.py              81
-rw-r--r--  python/fatcat_tools/importers/jalc.py                32
-rw-r--r--  python/fatcat_tools/importers/journal_metadata.py    19
-rw-r--r--  python/fatcat_tools/importers/jstor.py               23
-rw-r--r--  python/fatcat_tools/importers/matched.py             15
-rw-r--r--  python/fatcat_tools/importers/orcid.py               18
-rw-r--r--  python/fatcat_tools/importers/pubmed.py              27
-rw-r--r--  python/fatcat_tools/importers/shadow.py              15
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py      28
22 files changed, 443 insertions, 274 deletions
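
Taken together, these diffs add explicit type annotations to the importer constructors and hook methods (want, parse_record, try_update, insert_batch) across the importers package. Below is a minimal sketch of the resulting interface; the ExampleImporter class, its field choices, and the editgroup description are illustrative only, while EntityImporter, the fatcat_openapi_client types, and the method signatures come from the diffs that follow.

    from typing import Any, Dict, List, Optional

    import fatcat_openapi_client
    from fatcat_openapi_client import ApiClient, FileEntity

    from fatcat_tools.importers.common import EntityImporter


    class ExampleImporter(EntityImporter):
        """Hypothetical importer showing the annotated hook signatures."""

        def __init__(self, api: ApiClient, **kwargs) -> None:
            super().__init__(api, **kwargs)

        def want(self, raw_record: Any) -> bool:
            # cheap filter before parsing; return False to skip the record
            return bool(raw_record.get("sha1"))

        def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:
            # map one raw record to an entity, or None to skip it
            return FileEntity(sha1=row["sha1"], size=row.get("size_bytes"))

        def try_update(self, fe: FileEntity) -> bool:
            # True means "insert via insert_batch()"; False means skip or update in place
            return True

        def insert_batch(self, batch: List[FileEntity]) -> None:
            # the Editgroup/entity_list fields are assumed here; the hunks below
            # truncate the full create_*_auto_batch() calls
            self.api.create_file_auto_batch(
                fatcat_openapi_client.FileAutoBatch(
                    editgroup=fatcat_openapi_client.Editgroup(
                        description="example batch (illustrative)"
                    ),
                    entity_list=batch,
                )
            )
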
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index ae4f9049..2fb7be55 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -1,4 +1,7 @@
+from typing import Any, Dict, List, Optional
+
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
@@ -36,7 +39,9 @@ class ArabesqueMatchImporter(EntityImporter):
- a mode to insert bare files even if identifier not known?
"""
- def __init__(self, api, extid_type, require_grobid=True, **kwargs):
+ def __init__(
+ self, api: ApiClient, extid_type: str, require_grobid: bool = True, **kwargs
+ ) -> None:
eg_desc = (
kwargs.get("editgroup_description", None)
@@ -59,7 +64,7 @@ class ArabesqueMatchImporter(EntityImporter):
else:
print("NOT checking GROBID status column")
- def want(self, row):
+ def want(self, row: Any) -> bool:
if self.require_grobid and not row["postproc_status"] == "200":
return False
if (
@@ -76,7 +81,7 @@ class ArabesqueMatchImporter(EntityImporter):
else:
return False
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:
extid = row["identifier"].strip()
@@ -131,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter):
)
return fe
- def try_update(self, fe):
+ def try_update(self, fe: FileEntity) -> bool:
# lookup sha1, or create new entity
existing = None
try:
@@ -182,7 +187,7 @@ class ArabesqueMatchImporter(EntityImporter):
self.counts["update"] += 1
return False
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FileEntity]) -> None:
self.api.create_file_auto_batch(
fatcat_openapi_client.FileAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 0957db2c..1d50dd9a 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -2,9 +2,11 @@ import datetime
import json
import re
import sys
+from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from bs4 import BeautifulSoup
+from fatcat_openapi_client import ApiClient, ReleaseEntity
from pylatexenc.latex2text import LatexNodes2Text
from .common import EntityImporter
@@ -13,7 +15,7 @@ from .crossref import lookup_license_slug
latex2text = LatexNodes2Text()
-def latex_to_text(raw):
+def latex_to_text(raw: str) -> str:
try:
return latex2text.latex_to_text(raw).strip()
except AttributeError:
@@ -22,7 +24,7 @@ def latex_to_text(raw):
return raw.strip()
-def parse_arxiv_authors(raw):
+def parse_arxiv_authors(raw: str) -> List[str]:
if not raw:
return []
raw = raw.replace("*", "")
@@ -41,7 +43,7 @@ def parse_arxiv_authors(raw):
return authors
-def test_parse_arxiv_authors():
+def test_parse_arxiv_authors() -> None:
assert parse_arxiv_authors(
"Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an"
@@ -88,7 +90,7 @@ class ArxivRawImporter(EntityImporter):
the "most recent" version; can be a simple sort?
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -107,15 +109,17 @@ class ArxivRawImporter(EntityImporter):
)
self._test_override = False
- def parse_record(self, record):
+ # TODO: record is really a beautiful soup element, but setting to 'Any' to
+ # make initial type annotations simple
+ def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]:
if not record:
return None
metadata = record.arXivRaw
if not metadata:
return None
- extra = dict()
- extra_arxiv = dict()
+ extra: Dict[str, Any] = dict()
+ extra_arxiv: Dict[str, Any] = dict()
# don't know!
release_type = "article"
@@ -134,7 +138,7 @@ class ArxivRawImporter(EntityImporter):
for i, a in enumerate(authors)
]
- lang = "en" # the vast majority in english
+ lang: Optional[str] = "en" # the vast majority in english
if metadata.comments and metadata.comments.get_text():
comments = metadata.comments.get_text().replace("\n", " ").strip()
extra_arxiv["comments"] = comments
@@ -229,7 +233,7 @@ class ArxivRawImporter(EntityImporter):
).date()
# TODO: source_type?
versions.append(
- fatcat_openapi_client.ReleaseEntity(
+ ReleaseEntity(
work_id=None,
title=title,
# original_title
@@ -261,7 +265,7 @@ class ArxivRawImporter(EntityImporter):
versions[-1].release_stage = "accepted"
return versions
- def try_update(self, versions):
+ def try_update(self, versions: List[ReleaseEntity]) -> bool:
"""
This is pretty complex! There is no batch/bezerk mode for arxiv importer.
@@ -344,7 +348,7 @@ class ArxivRawImporter(EntityImporter):
return False
- def insert_batch(self, batch_batch):
+ def insert_batch(self, batch_batch: List[ReleaseEntity]) -> None:
# there is no batch/bezerk mode for arxiv importer, except for testing
if self._test_override:
for batch in batch_batch:
@@ -360,7 +364,7 @@ class ArxivRawImporter(EntityImporter):
else:
raise NotImplementedError()
- def parse_file(self, handle):
+ def parse_file(self, handle: Any) -> None:
# 1. open with beautiful soup
soup = BeautifulSoup(handle, "xml")
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index e9de42fc..b88117e0 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -7,10 +7,13 @@ import os
import subprocess
import sys
import urllib
+import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
import fatcat_openapi_client
import magic
from fatcat_openapi_client import (
+ ApiClient,
Editgroup,
FilesetEntity,
FilesetFile,
@@ -24,7 +27,7 @@ from .common import clean
from .crossref import lookup_license_slug
-def single_file(prefix, path):
+def single_file(prefix: str, path: str) -> FilesetFile:
full = prefix + path
size_bytes = os.stat(full).st_size
@@ -59,7 +62,7 @@ def single_file(prefix, path):
return fsf
-def make_manifest(base_dir):
+def make_manifest(base_dir: str) -> List[FilesetFile]:
manifest = []
for root, dirs, files in os.walk(base_dir):
for f in files:
@@ -67,7 +70,9 @@ def make_manifest(base_dir):
return manifest
-def cdl_dash_release(meta, extra=None):
+def cdl_dash_release(
+ meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
+) -> ReleaseEntity:
if not extra:
extra = dict()
@@ -124,7 +129,7 @@ def cdl_dash_release(meta, extra=None):
return r
-def make_release_fileset(dat_path):
+def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
if dat_path.endswith("/"):
dat_path = dat_path[:-1]
@@ -170,7 +175,12 @@ def make_release_fileset(dat_path):
return (release, fs)
-def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
+def auto_cdl_dash_dat(
+ api: ApiClient,
+ dat_path: str,
+ release_id: Optional[str] = None,
+ editgroup_id: Optional[str] = None,
+) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 8d2a89b6..842c7853 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -1,4 +1,7 @@
+from typing import Any, Dict, List, Optional
+
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, ContainerEntity
from .common import EntityImporter, clean
@@ -12,7 +15,7 @@ class ChoculaImporter(EntityImporter):
See guide for details on the many 'extra' fields used here.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -22,7 +25,7 @@ class ChoculaImporter(EntityImporter):
eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter")
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
if not raw_record.get("ident") and not raw_record.get("_known_issnl"):
self.counts["skip-unknown-new-issnl"] += 1
return False
@@ -30,7 +33,7 @@ class ChoculaImporter(EntityImporter):
return True
return False
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
"""
row is a python dict (parsed from JSON).
@@ -75,7 +78,7 @@ class ChoculaImporter(EntityImporter):
elif "journal " in name.lower():
container_type = "journal"
- ce = fatcat_openapi_client.ContainerEntity(
+ ce = ContainerEntity(
issnl=row["issnl"],
issnp=row["extra"].get("issnp"),
issne=row["extra"].get("issne"),
@@ -88,7 +91,7 @@ class ChoculaImporter(EntityImporter):
)
return ce
- def try_update(self, ce):
+ def try_update(self, ce: ContainerEntity) -> bool:
existing = None
if ce.ident:
@@ -193,7 +196,7 @@ class ChoculaImporter(EntityImporter):
# if we got this far, it's a bug
raise NotImplementedError
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ContainerEntity]) -> None:
self.api.create_container_auto_batch(
fatcat_openapi_client.ContainerAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 0b68e5fe..fd472d11 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,7 +7,7 @@ import subprocess
import sys
import xml.etree.ElementTree as ET
from collections import Counter
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple
import elasticsearch
import fatcat_openapi_client
@@ -16,7 +16,14 @@ import fuzzycat.verify
import lxml
from bs4 import BeautifulSoup
from confluent_kafka import Consumer, KafkaException
-from fatcat_openapi_client import ReleaseEntity
+from fatcat_openapi_client import (
+ ApiClient,
+ ContainerEntity,
+ EntityEdit,
+ FileEntity,
+ FilesetEntity,
+ ReleaseEntity,
+)
from fatcat_openapi_client.rest import ApiException
from fuzzycat.matching import match_release_fuzzy
@@ -90,7 +97,7 @@ DOMAIN_REL_MAP: Dict[str, str] = {
}
-def make_rel_url(raw_url: str, default_link_rel: str = "web"):
+def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
# this is where we map specific domains to rel types, and also filter out
# bad domains, invalid URLs, etc
rel = default_link_rel
@@ -101,7 +108,7 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"):
return (rel, raw_url)
-def test_make_rel_url():
+def test_make_rel_url() -> None:
assert make_rel_url("http://example.com/thing.pdf")[0] == "web"
assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans"
assert (
@@ -145,7 +152,7 @@ class EntityImporter:
implementors must write insert_batch appropriately
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_extra = kwargs.get("editgroup_extra", dict())
eg_extra["git_rev"] = eg_extra.get(
@@ -212,7 +219,7 @@ class EntityImporter:
# implementations should fill this in
raise NotImplementedError
- def finish(self):
+ def finish(self) -> Counter:
"""
Gets called as cleanup at the end of imports, but can also be called at
any time to "snip off" current editgroup progress. In other words, safe
@@ -238,7 +245,7 @@ class EntityImporter:
return self.counts
- def get_editgroup_id(self, edits=1):
+ def get_editgroup_id(self, edits: int = 1) -> str:
if self._edit_count >= self.edit_batch_size:
if self.submit_mode:
self.api.submit_editgroup(self._editgroup_id)
@@ -257,30 +264,31 @@ class EntityImporter:
self._editgroup_id = eg.editgroup_id
self._edit_count += edits
+ assert self._editgroup_id
return self._editgroup_id
- def create_container(self, entity):
+ def create_container(self, entity: ContainerEntity) -> EntityEdit:
eg_id = self.get_editgroup_id()
self.counts["inserted.container"] += 1
return self.api.create_container(eg_id, entity)
- def create_release(self, entity):
+ def create_release(self, entity: ReleaseEntity) -> EntityEdit:
eg_id = self.get_editgroup_id()
self.counts["inserted.release"] += 1
return self.api.create_release(eg_id, entity)
- def create_file(self, entity):
+ def create_file(self, entity: FileEntity) -> EntityEdit:
eg_id = self.get_editgroup_id()
self.counts["inserted.file"] += 1
return self.api.create_file(eg_id, entity)
- def updated(self):
+ def updated(self) -> None:
"""
Implementations should call this from try_update() if the update was successful
"""
self.counts["update"] += 1
- def push_entity(self, entity):
+ def push_entity(self, entity: Any) -> None:
self._entity_queue.append(entity)
if len(self._entity_queue) >= self.edit_batch_size:
self.insert_batch(self._entity_queue)
@@ -294,7 +302,7 @@ class EntityImporter:
"""
return True
- def try_update(self, raw_record):
+ def try_update(self, raw_record: Any) -> Optional[bool]:
"""
Passed the output of parse_record(). Should try to find an existing
entity and update it (PUT), decide we should do nothing (based on the
@@ -307,15 +315,17 @@ class EntityImporter:
"""
raise NotImplementedError
- def insert_batch(self, raw_records: List[Any]):
+ def insert_batch(self, raw_records: List[Any]) -> None:
raise NotImplementedError
def is_orcid(self, orcid: str) -> bool:
# TODO: replace with clean_orcid() from fatcat_tools.normal
return self._orcid_regex.match(orcid) is not None
- def lookup_orcid(self, orcid: str):
- """Caches calls to the Orcid lookup API endpoint in a local dict"""
+ def lookup_orcid(self, orcid: str) -> Optional[str]:
+ """Caches calls to the Orcid lookup API endpoint in a local dict.
+
+ Returns a creator fatcat ident if found, else None"""
if not self.is_orcid(orcid):
return None
if orcid in self._orcid_id_map:
@@ -335,7 +345,7 @@ class EntityImporter:
# TODO: replace with clean_doi() from fatcat_tools.normal
return doi.startswith("10.") and doi.count("/") >= 1
- def lookup_doi(self, doi: str):
+ def lookup_doi(self, doi: str) -> Optional[str]:
"""Caches calls to the doi lookup API endpoint in a local dict
For identifier lookups only (not full object fetches)"""
@@ -354,7 +364,7 @@ class EntityImporter:
self._doi_id_map[doi] = release_id # might be None
return release_id
- def lookup_pmid(self, pmid: str):
+ def lookup_pmid(self, pmid: str) -> Optional[str]:
"""Caches calls to the pmid lookup API endpoint in a local dict
For identifier lookups only (not full object fetches)"""
@@ -374,7 +384,7 @@ class EntityImporter:
def is_issnl(self, issnl: str) -> bool:
return len(issnl) == 9 and issnl[4] == "-"
- def lookup_issnl(self, issnl: str):
+ def lookup_issnl(self, issnl: str) -> Optional[str]:
"""Caches calls to the ISSN-L lookup API endpoint in a local dict"""
if issnl in self._issnl_id_map:
return self._issnl_id_map[issnl]
@@ -389,7 +399,7 @@ class EntityImporter:
self._issnl_id_map[issnl] = container_id # might be None
return container_id
- def read_issn_map_file(self, issn_map_file):
+ def read_issn_map_file(self, issn_map_file: Sequence) -> None:
print("Loading ISSN map file...", file=sys.stderr)
self._issn_issnl_map = dict()
for line in issn_map_file:
@@ -407,7 +417,7 @@ class EntityImporter:
return self._issn_issnl_map.get(issn)
@staticmethod
- def generic_file_cleanups(existing):
+ def generic_file_cleanups(existing: FileEntity) -> FileEntity:
"""
Conservative cleanup of existing file entities.
@@ -453,7 +463,7 @@ class EntityImporter:
return existing
@staticmethod
- def generic_fileset_cleanups(existing):
+ def generic_fileset_cleanups(existing: FilesetEntity) -> FilesetEntity:
return existing
def match_existing_release_fuzzy(
@@ -520,10 +530,10 @@ class RecordPusher:
wraps an importer and pushes records in to it.
"""
- def __init__(self, importer, **kwargs):
+ def __init__(self, importer: EntityImporter, **kwargs) -> None:
self.importer = importer
- def run(self):
+ def run(self) -> Counter:
"""
This will look something like:
@@ -536,11 +546,11 @@ class RecordPusher:
class JsonLinePusher(RecordPusher):
- def __init__(self, importer, json_file, **kwargs):
+ def __init__(self, importer: EntityImporter, json_file: Sequence, **kwargs) -> None:
self.importer = importer
self.json_file = json_file
- def run(self):
+ def run(self) -> Counter:
for line in self.json_file:
if not line:
continue
@@ -552,11 +562,11 @@ class JsonLinePusher(RecordPusher):
class CsvPusher(RecordPusher):
- def __init__(self, importer, csv_file, **kwargs):
+ def __init__(self, importer: EntityImporter, csv_file: Any, **kwargs) -> None:
self.importer = importer
self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ","))
- def run(self):
+ def run(self) -> Counter:
for line in self.reader:
if not line:
continue
@@ -567,11 +577,11 @@ class CsvPusher(RecordPusher):
class LinePusher(RecordPusher):
- def __init__(self, importer, text_file, **kwargs):
+ def __init__(self, importer: EntityImporter, text_file: Sequence, **kwargs) -> None:
self.importer = importer
self.text_file = text_file
- def run(self):
+ def run(self) -> Counter:
for line in self.text_file:
if not line:
continue
@@ -582,14 +592,21 @@ class LinePusher(RecordPusher):
class SqlitePusher(RecordPusher):
- def __init__(self, importer, db_file, table_name, where_clause="", **kwargs):
+ def __init__(
+ self,
+ importer: EntityImporter,
+ db_file: str,
+ table_name: str,
+ where_clause: str = "",
+ **kwargs
+ ) -> None:
self.importer = importer
self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
self.db.row_factory = sqlite3.Row
self.table_name = table_name
self.where_clause = where_clause
- def run(self):
+ def run(self) -> Counter:
cur = self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause))
for row in cur:
self.importer.push_record(row)
@@ -599,12 +616,18 @@ class SqlitePusher(RecordPusher):
class Bs4XmlLinesPusher(RecordPusher):
- def __init__(self, importer, xml_file, prefix_filter=None, **kwargs):
+ def __init__(
+ self,
+ importer: EntityImporter,
+ xml_file: Sequence,
+ prefix_filter: Optional[str] = None,
+ **kwargs
+ ) -> None:
self.importer = importer
self.xml_file = xml_file
self.prefix_filter = prefix_filter
- def run(self):
+ def run(self) -> Counter:
for line in self.xml_file:
if not line:
continue
@@ -619,12 +642,14 @@ class Bs4XmlLinesPusher(RecordPusher):
class Bs4XmlFilePusher(RecordPusher):
- def __init__(self, importer, xml_file, record_tag, **kwargs):
+ def __init__(
+ self, importer: EntityImporter, xml_file: Any, record_tag: str, **kwargs
+ ) -> None:
self.importer = importer
self.xml_file = xml_file
self.record_tag = record_tag
- def run(self):
+ def run(self) -> Counter:
soup = BeautifulSoup(self.xml_file, "xml")
for record in soup.find_all(self.record_tag):
self.importer.push_record(record)
@@ -654,13 +679,20 @@ class Bs4XmlLargeFilePusher(RecordPusher):
by inner container/release API lookup caches.
"""
- def __init__(self, importer, xml_file, record_tags, use_lxml=False, **kwargs):
+ def __init__(
+ self,
+ importer: EntityImporter,
+ xml_file: Any,
+ record_tags: List[str],
+ use_lxml: bool = False,
+ **kwargs
+ ) -> None:
self.importer = importer
self.xml_file = xml_file
self.record_tags = record_tags
self.use_lxml = use_lxml
- def run(self):
+ def run(self) -> Counter:
if self.use_lxml:
elem_iter = lxml.etree.iterparse(self.xml_file, ["start", "end"], load_dtd=True)
else:
@@ -691,12 +723,14 @@ class Bs4XmlLargeFilePusher(RecordPusher):
class Bs4XmlFileListPusher(RecordPusher):
- def __init__(self, importer, list_file, record_tag, **kwargs):
+ def __init__(
+ self, importer: EntityImporter, list_file: Sequence, record_tag: str, **kwargs
+ ) -> None:
self.importer = importer
self.list_file = list_file
self.record_tag = record_tag
- def run(self):
+ def run(self) -> Counter:
for xml_path in self.list_file:
xml_path = xml_path.strip()
if not xml_path or xml_path.startswith("#"):
@@ -717,7 +751,15 @@ class KafkaBs4XmlPusher(RecordPusher):
Fetch XML for an article from Kafka, parse via Bs4.
"""
- def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
+ def __init__(
+ self,
+ importer: EntityImporter,
+ kafka_hosts: str,
+ kafka_env: str,
+ topic_suffix: str,
+ group: str,
+ **kwargs
+ ) -> None:
self.importer = importer
self.consumer = make_kafka_consumer(
kafka_hosts,
@@ -729,7 +771,7 @@ class KafkaBs4XmlPusher(RecordPusher):
self.poll_interval = kwargs.get("poll_interval", 5.0)
self.consume_batch_size = kwargs.get("consume_batch_size", 25)
- def run(self):
+ def run(self) -> Counter:
count = 0
last_push = datetime.datetime.now()
while True:
@@ -784,7 +826,15 @@ class KafkaBs4XmlPusher(RecordPusher):
class KafkaJsonPusher(RecordPusher):
- def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
+ def __init__(
+ self,
+ importer: EntityImporter,
+ kafka_hosts: str,
+ kafka_env: str,
+ topic_suffix: str,
+ group: str,
+ **kwargs
+ ) -> None:
self.importer = importer
self.consumer = make_kafka_consumer(
kafka_hosts,
@@ -797,7 +847,7 @@ class KafkaJsonPusher(RecordPusher):
self.consume_batch_size = kwargs.get("consume_batch_size", 100)
self.force_flush = kwargs.get("force_flush", False)
- def run(self):
+ def run(self) -> Counter:
count = 0
last_push = datetime.datetime.now()
last_force_flush = datetime.datetime.now()
@@ -862,10 +912,12 @@ class KafkaJsonPusher(RecordPusher):
return counts
-def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat"):
+def make_kafka_consumer(
+ hosts: str, env: str, topic_suffix: str, group: str, kafka_namespace: str = "fatcat"
+) -> Consumer:
topic_name = "{}-{}.{}".format(kafka_namespace, env, topic_suffix)
- def fail_fast(err, partitions):
+ def fail_fast(err: Any, partitions: List[Any]) -> None:
if err is not None:
print("Kafka consumer commit error: {}".format(err))
print("Bailing out...")
@@ -900,7 +952,7 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat
},
}
- def on_rebalance(consumer, partitions):
+ def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
for p in partitions:
if p.error:
raise KafkaException(p.error)
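
The pusher classes above now declare run() -> Counter, so callers get the import counts back directly. Continuing the hypothetical ExampleImporter sketch from earlier (the file name and editgroup description are placeholders, and api is assumed to be an ApiClient constructed elsewhere):

    from collections import Counter

    from fatcat_tools.importers.common import JsonLinePusher

    # api: ApiClient is assumed to exist already (construction not shown)
    importer = ExampleImporter(api, editgroup_description="example import")
    with open("records.json") as json_file:
        counts: Counter = JsonLinePusher(importer, json_file).run()
    print(counts)
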
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index d0017002..689989d2 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,9 +1,9 @@
import datetime
import sqlite3
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
-from fatcat_openapi_client import ReleaseEntity
+from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
from .common import EntityImporter, clean
@@ -90,7 +90,7 @@ LICENSE_SLUG_MAP: Dict[str, str] = {
}
-def lookup_license_slug(raw: str) -> Optional[str]:
+def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
raw = raw.strip().replace("http://", "//").replace("https://", "//")
@@ -102,7 +102,7 @@ def lookup_license_slug(raw: str) -> Optional[str]:
return LICENSE_SLUG_MAP.get(raw)
-def test_lookup_license_slug():
+def test_lookup_license_slug() -> None:
assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
assert (
@@ -133,13 +133,13 @@ class CrossrefImporter(EntityImporter):
See https://github.com/CrossRef/rest-api-doc for JSON schema notes
"""
- def __init__(self, api, issn_map_file, **kwargs):
+ def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
eg_desc: Optional[str] = kwargs.get(
"editgroup_description",
"Automated import of Crossref DOI metadata, harvested from REST API",
)
- eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict())
+ eg_extra: Dict[str, Any] = kwargs.get("editgroup_extra", dict())
eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter")
super().__init__(
api,
@@ -249,7 +249,7 @@ class CrossrefImporter(EntityImporter):
release_type = self.map_release_type(obj["type"])
# contribs
- def do_contribs(obj_list, ctype):
+ def do_contribs(obj_list: List[Dict[str, Any]], ctype: str) -> List[ReleaseContrib]:
contribs = []
for i, am in enumerate(obj_list):
creator_id = None
@@ -257,15 +257,15 @@ class CrossrefImporter(EntityImporter):
creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])
# Sorry humans :(
if am.get("given") and am.get("family"):
- raw_name = "{} {}".format(am["given"], am["family"])
+ raw_name: Optional[str] = "{} {}".format(am["given"], am["family"])
elif am.get("family"):
raw_name = am["family"]
else:
# TODO: can end up empty
raw_name = am.get("name") or am.get("given")
- extra = dict()
+ extra: Dict[str, Any] = dict()
if ctype == "author":
- index = i
+ index: Optional[int] = i
else:
index = None
raw_affiliation = None
@@ -284,7 +284,7 @@ class CrossrefImporter(EntityImporter):
assert ctype in ("author", "editor", "translator")
raw_name = clean(raw_name)
contribs.append(
- fatcat_openapi_client.ReleaseContrib(
+ ReleaseContrib(
creator_id=creator_id,
index=index,
raw_name=raw_name,
@@ -559,7 +559,7 @@ class CrossrefImporter(EntityImporter):
)
return re
- def try_update(self, re):
+ def try_update(self, re: ReleaseEntity) -> bool:
# lookup existing DOI (don't need to try other ext idents for crossref)
existing = None
@@ -577,7 +577,7 @@ class CrossrefImporter(EntityImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ReleaseEntity]) -> None:
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 4c174b0b..7cc5fa20 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -14,11 +14,13 @@ import json
import re
import sqlite3
import sys
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
import dateparser
import fatcat_openapi_client
import langdetect
import pycountry
+from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
from fatcat_tools.normal import clean_doi
from fatcat_tools.transforms import entity_to_dict
@@ -29,7 +31,7 @@ from .common import EntityImporter, clean
MAX_ABSTRACT_LENGTH = 2048
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP = {
+CONTAINER_TYPE_MAP: Dict[str, str] = {
"Journal": "journal",
"Series": "journal",
"Book Series": "book-series",
@@ -38,7 +40,7 @@ CONTAINER_TYPE_MAP = {
# The docs/guide should be the canonical home for these mappings; update there
# first. Map various datacite type types to CSL-ish types. None means TODO or
# remove.
-DATACITE_TYPE_MAP = {
+DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
"ris": {
"THES": "thesis",
"SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
@@ -128,7 +130,7 @@ DATACITE_TYPE_MAP = {
}
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
-DATACITE_UNKNOWN_MARKERS = (
+DATACITE_UNKNOWN_MARKERS: List[str] = [
"(:unac)", # temporarily inaccessible
"(:unal)", # unallowed, suppressed intentionally
"(:unap)", # not applicable, makes no sense
@@ -139,11 +141,11 @@ DATACITE_UNKNOWN_MARKERS = (
"(:null)", # explicitly and meaningfully empty
"(:tba)", # to be assigned or announced later
"(:etal)", # too numerous to list (et alia)
-)
+]
# UNKNOWN_MARKERS joins official datacite markers with generic tokens marking
# unknown values.
-UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(
+UNKNOWN_MARKERS: Set[str] = set(DATACITE_UNKNOWN_MARKERS).union(
set(
(
"NA",
@@ -159,7 +161,7 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
# Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
-DATACITE_TITLE_SPAM_WORDGROUPS = [
+DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
{
"tokens": (
"full",
@@ -180,7 +182,7 @@ DATACITE_TITLE_SPAM_WORDGROUPS = [
]
# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP = {
+LICENSE_SLUG_MAP: Dict[str, str] = {
"//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
"//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
"//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
@@ -222,7 +224,14 @@ class DataciteImporter(EntityImporter):
Importer for datacite records.
"""
- def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs):
+ def __init__(
+ self,
+ api: ApiClient,
+ issn_map_file: Sequence,
+ debug: bool = False,
+ insert_log_file: Optional[str] = None,
+ **kwargs
+ ) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -255,7 +264,7 @@ class DataciteImporter(EntityImporter):
print("datacite with debug={}".format(self.debug), file=sys.stderr)
- def lookup_ext_ids(self, doi):
+ def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
"""
Return dictionary of identifiers referring to the same things as the given DOI.
"""
@@ -291,7 +300,7 @@ class DataciteImporter(EntityImporter):
jstor_id=None,
)
- def parse_record(self, obj):
+ def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
"""
Mapping datacite JSON to ReleaseEntity.
"""
@@ -413,7 +422,7 @@ class DataciteImporter(EntityImporter):
# Start with clear stages, e.g. published. TODO(martin): we could
# probably infer a bit more from the relations, e.g.
# "IsPreviousVersionOf" or "IsNewVersionOf".
- release_stage = "published"
+ release_stage: Optional[str] = "published"
# TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
# we might want something else than 'published'. See also:
@@ -628,7 +637,7 @@ class DataciteImporter(EntityImporter):
release_type = "review"
# Extra information.
- extra_datacite = dict()
+ extra_datacite: Dict[str, Any] = dict()
if license_extra:
extra_datacite["license"] = license_extra
@@ -675,7 +684,7 @@ class DataciteImporter(EntityImporter):
if relations:
extra_datacite["relations"] = relations
- extra = dict()
+ extra: Dict[str, Any] = dict()
# "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0",
# "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555",
@@ -734,7 +743,7 @@ class DataciteImporter(EntityImporter):
return re
@staticmethod
- def datacite_release_type(doi, attributes):
+ def datacite_release_type(doi: str, attributes: Dict[str, Any]) -> Optional[str]:
"""
Release type. Try to determine the release type from a variety of types
supplied in datacite. The "attributes.types.resourceType" is
@@ -766,7 +775,7 @@ class DataciteImporter(EntityImporter):
return release_type
@staticmethod
- def biblio_hacks(re):
+ def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity:
"""
This function handles known special cases. For example,
publisher-specific or platform-specific workarounds.
@@ -817,7 +826,7 @@ class DataciteImporter(EntityImporter):
return re
- def try_update(self, re):
+ def try_update(self, re: ReleaseEntity) -> bool:
"""
When debug is true, write the RE to stdout, not to the database. Might
hide schema mismatch bugs.
@@ -842,7 +851,7 @@ class DataciteImporter(EntityImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ReleaseEntity]) -> None:
print("inserting batch ({})".format(len(batch)), file=sys.stderr)
if self.insert_log_file:
with open(self.insert_log_file, "a") as f:
@@ -858,7 +867,13 @@ class DataciteImporter(EntityImporter):
)
)
- def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None):
+ def parse_datacite_creators(
+ self,
+ creators: List[Dict[str, Any]],
+ role: str = "author",
+ set_index: bool = True,
+ doi: Optional[str] = None,
+ ) -> List[ReleaseContrib]:
"""
Parses a list of creators into a list of ReleaseContrib objects. Set
set_index to False, if the index contrib field should be left blank.
@@ -868,12 +883,12 @@ class DataciteImporter(EntityImporter):
# "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
# ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
# "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
- contribs = []
+ contribs: List[ReleaseContrib] = []
# Names, that should be ignored right away.
name_blocklist = set(("Occdownload Gbif.Org",))
- i = 0
+ i: Optional[int] = 0
for c in creators:
if not set_index:
i = None
@@ -983,7 +998,9 @@ class DataciteImporter(EntityImporter):
return contribs
-def contributor_list_contains_contributor(contributor_list, contributor):
+def contributor_list_contains_contributor(
+ contributor_list: List[ReleaseContrib], contributor: ReleaseContrib
+) -> bool:
"""
Given a list of contributors, determine whether contrib is in that list.
"""
@@ -998,7 +1015,7 @@ def contributor_list_contains_contributor(contributor_list, contributor):
return False
-def lookup_license_slug(raw):
+def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
"""
Resolve a variety of strings into some pseudo-canonical form, e.g.
CC-BY-ND, CC-0, MIT and so on.
@@ -1101,7 +1118,9 @@ def lookup_license_slug(raw):
return LICENSE_SLUG_MAP.get(raw)
-def find_original_language_title(item, min_length=4, max_questionmarks=3):
+def find_original_language_title(
+ item: Dict[str, Any], min_length: int = 4, max_questionmarks: int = 3
+) -> Optional[str]:
"""
Perform a few checks before returning a potential original language title.
@@ -1126,7 +1145,9 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3):
return None
-def parse_datacite_titles(titles):
+def parse_datacite_titles(
+ titles: List[Dict[str, Any]]
+) -> Tuple[Optional[str], Optional[str], Optional[str]]:
"""
Given a list of title items from datacite, return 3-tuple (title,
original_language_title, subtitle).
@@ -1158,7 +1179,9 @@ def parse_datacite_titles(titles):
return title, original_language_title, subtitle
-def parse_single_date(value):
+def parse_single_date(
+ value: Optional[str],
+) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]:
"""
Given a single string containing a date in arbitrary format, try to return
tuple (date: datetime.date, month: int, year: int).
@@ -1186,10 +1209,12 @@ def parse_single_date(value):
return None, None, None
-def parse_datacite_dates(dates):
+def parse_datacite_dates(
+ dates: List[Dict[str, Any]],
+) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]:
"""
Given a list of date fields (under .dates), return tuple, (release_date,
- release_year).
+ release_month, release_year).
"""
release_date, release_month, release_year = None, None, None
@@ -1226,9 +1251,13 @@ def parse_datacite_dates(dates):
Pattern("%Y", "y"),
)
- def parse_item(item):
+ def parse_item(
+ item: Dict[str, Any]
+ ) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]:
result, value, year_only = None, str(item.get("date", "")) or "", False
- release_date, release_month, release_year = None, None, None
+ release_date: Optional[datetime.date] = None
+ release_month: Optional[int] = None
+ release_year: Optional[int] = None
for layout, granularity in common_patterns:
try:
@@ -1285,7 +1314,7 @@ def parse_datacite_dates(dates):
return release_date, release_month, release_year
-def index_form_to_display_name(s):
+def index_form_to_display_name(s: str) -> str:
"""
Try to convert an index form name, like 'Razis, Panos A' into display_name,
e.g. 'Panos A Razis'.
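
The datacite date helpers are now annotated to return a 3-tuple of Optional values, matching the corrected docstring for parse_datacite_dates(). A small usage sketch (the sample input is illustrative; the "dateType" key follows the DataCite schema rather than anything shown in this diff):

    from fatcat_tools.importers.datacite import parse_datacite_dates

    dates = [{"date": "2019-07-01", "dateType": "Issued"}]
    release_date, release_month, release_year = parse_datacite_dates(dates)
    # each element may be None when it cannot be determined from the input
    print(release_date, release_month, release_year)
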
diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py
index 603a6271..36fe5f00 100644
--- a/python/fatcat_tools/importers/dblp_container.py
+++ b/python/fatcat_tools/importers/dblp_container.py
@@ -4,8 +4,10 @@ pre-scraped in to JSON from HTML pages.
"""
import sys # noqa: F401
+from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, ContainerEntity
from fatcat_tools.importers.common import EntityImporter
from fatcat_tools.normal import clean_str
@@ -13,8 +15,13 @@ from fatcat_tools.normal import clean_str
class DblpContainerImporter(EntityImporter):
def __init__(
- self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs
- ):
+ self,
+ api: ApiClient,
+ issn_map_file: Sequence,
+ dblp_container_map_file: Sequence,
+ dblp_container_map_output: Any,
+ **kwargs
+ ) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -29,7 +36,7 @@ class DblpContainerImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
print("\t".join(["dblp_prefix", "container_id"]), file=self.dblp_container_map_output)
- def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
+ def read_dblp_container_map_file(self, dblp_container_map_file: Sequence) -> None:
self._dblp_container_map = dict()
print("Loading existing dblp prefix container map file...", file=sys.stderr)
for line in dblp_container_map_file:
@@ -44,15 +51,15 @@ class DblpContainerImporter(EntityImporter):
file=sys.stderr,
)
- def lookup_dblp_prefix(self, prefix):
+ def lookup_dblp_prefix(self, prefix: str) -> Optional[str]:
if not prefix:
return None
return self._dblp_container_map.get(prefix)
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
return True
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
"""
row is a python dict (parsed from JSON).
@@ -77,7 +84,7 @@ class DblpContainerImporter(EntityImporter):
if issnl:
break
- extra = {
+ extra: Dict[str, Any] = {
"dblp": {
"prefix": dblp_prefix,
},
@@ -98,7 +105,7 @@ class DblpContainerImporter(EntityImporter):
)
return ce
- def try_update(self, ce):
+ def try_update(self, ce: ContainerEntity) -> bool:
dblp_prefix = ce.extra["dblp"]["prefix"]
existing = None
@@ -135,7 +142,7 @@ class DblpContainerImporter(EntityImporter):
# shouldn't get here
raise NotImplementedError()
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ContainerEntity]) -> None:
"""
Because we want to print a prefix/container_id match for each row, we
require a special batch insert method
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index e73e5f33..cb56432a 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -24,10 +24,11 @@ import datetime
import json
import sys # noqa: F401
import warnings
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional, Sequence
import bs4
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, ReleaseEntity
from fatcat_tools.importers.common import EntityImporter
from fatcat_tools.normal import (
@@ -44,7 +45,9 @@ from fatcat_tools.transforms import entity_to_dict
class DblpReleaseImporter(EntityImporter):
- def __init__(self, api, dblp_container_map_file=None, **kwargs):
+ def __init__(
+ self, api: ApiClient, dblp_container_map_file: Optional[Sequence] = None, **kwargs
+ ) -> None:
eg_desc = kwargs.get(
"editgroup_description", "Automated import of dblp metadata via XML records"
@@ -70,7 +73,7 @@ class DblpReleaseImporter(EntityImporter):
# "data", # no instances in 2020-11 dump
]
- def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
+ def read_dblp_container_map_file(self, dblp_container_map_file: Optional[Sequence]) -> None:
self._dblp_container_map = dict()
if not dblp_container_map_file:
print(
@@ -91,12 +94,12 @@ class DblpReleaseImporter(EntityImporter):
file=sys.stderr,
)
- def lookup_dblp_prefix(self, prefix):
+ def lookup_dblp_prefix(self, prefix: Optional[str]) -> Optional[str]:
if not prefix:
return None
return self._dblp_container_map.get(prefix)
- def want(self, xml_elem):
+ def want(self, xml_elem: Any) -> bool:
if xml_elem.name not in self.ELEMENT_TYPES:
self.counts["skip-type"] += 1
return False
@@ -108,7 +111,8 @@ class DblpReleaseImporter(EntityImporter):
return False
return True
- def parse_record(self, xml_elem):
+ # TODO: xml_elem could be typed instead of 'Any' for better type checking
+ def parse_record(self, xml_elem: Any) -> Optional[ReleaseEntity]:
"""
- title
=> may contain <i>, <sub>, <sup>, <tt>
@@ -255,7 +259,7 @@ class DblpReleaseImporter(EntityImporter):
dblp_extra["part_of_key"] = part_of_key
# generic extra
- extra = dict()
+ extra: Dict[str, Any] = dict()
if not container_id and container_name:
extra["container_name"] = container_name
@@ -312,14 +316,14 @@ class DblpReleaseImporter(EntityImporter):
return re
@staticmethod
- def biblio_hacks(re):
+ def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity:
"""
This function handles known special cases. For example,
publisher-specific or platform-specific workarounds.
"""
return re
- def try_update(self, re):
+ def try_update(self, re: ReleaseEntity) -> bool:
# lookup existing release by dblp article id
existing = None
@@ -411,7 +415,7 @@ class DblpReleaseImporter(EntityImporter):
return False
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ReleaseEntity]) -> None:
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 56045ea7..9ff4f3fb 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -6,9 +6,10 @@ DOAJ API schema and docs: https://doaj.org/api/v1/docs
import datetime
import warnings
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, ReleaseEntity
from fatcat_tools.importers.common import EntityImporter
from fatcat_tools.normal import (
@@ -28,7 +29,7 @@ MAX_ABSTRACT_LENGTH = 2048
class DoajArticleImporter(EntityImporter):
- def __init__(self, api, issn_map_file, **kwargs):
+ def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -49,10 +50,10 @@ class DoajArticleImporter(EntityImporter):
self.this_year = datetime.datetime.now().year
self.read_issn_map_file(issn_map_file)
- def want(self, obj):
+ def want(self, raw_record: Dict[str, Any]) -> bool:
return True
- def parse_record(self, obj):
+ def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
"""
bibjson {
abstract (string, optional),
@@ -108,7 +109,7 @@ class DoajArticleImporter(EntityImporter):
publisher = clean_str(bibjson["journal"].get("publisher"))
try:
- release_year = int(bibjson.get("year"))
+ release_year: Optional[int] = int(bibjson.get("year"))
except (TypeError, ValueError):
release_year = None
release_month = parse_month(clean_str(bibjson.get("month")))
@@ -148,7 +149,7 @@ class DoajArticleImporter(EntityImporter):
contribs = self.doaj_contribs(bibjson.get("author") or [])
# DOAJ-specific extra
- doaj_extra = dict()
+ doaj_extra: Dict[str, Any] = dict()
if bibjson.get("subject"):
doaj_extra["subject"] = bibjson.get("subject")
if bibjson.get("keywords"):
@@ -157,7 +158,7 @@ class DoajArticleImporter(EntityImporter):
]
# generic extra
- extra = dict()
+ extra: Dict[str, Any] = dict()
if country:
extra["country"] = country
if not container_id and container_name:
@@ -194,14 +195,14 @@ class DoajArticleImporter(EntityImporter):
return re
@staticmethod
- def biblio_hacks(re):
+ def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity:
"""
This function handles known special cases. For example,
publisher-specific or platform-specific workarounds.
"""
return re
- def try_update(self, re):
+ def try_update(self, re: ReleaseEntity) -> bool:
# lookup existing release by DOAJ article id
existing = None
@@ -276,7 +277,7 @@ class DoajArticleImporter(EntityImporter):
return False
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ReleaseEntity]) -> None:
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 26584ff3..892c1dcd 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -1,4 +1,7 @@
+from typing import Any, Dict
+
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
from .common import EntityImporter
@@ -14,7 +17,7 @@ class FileMetaImporter(EntityImporter):
imported which were missing file size, mimetype, md5, and/or sha256.
"""
- def __init__(self, api, require_grobid=True, **kwargs):
+ def __init__(self, api: ApiClient, require_grobid: bool = True, **kwargs):
eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates"
eg_extra = kwargs.pop("editgroup_extra", dict())
@@ -22,14 +25,14 @@ class FileMetaImporter(EntityImporter):
kwargs["do_updates"] = kwargs.get("do_updates", True)
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Any) -> bool:
for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"):
if not row.get(k):
self.counts["skip-missing-field"] += 1
return False
return True
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> FileEntity:
# bezerk mode doesn't make sense for this importer
assert self.bezerk_mode is False
@@ -44,7 +47,7 @@ class FileMetaImporter(EntityImporter):
)
return fe
- def try_update(self, fe):
+ def try_update(self, fe: FileEntity) -> bool:
# lookup sha1, or create new entity
existing = None
diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py
index dd8f5600..2207b938 100644
--- a/python/fatcat_tools/importers/fileset_generic.py
+++ b/python/fatcat_tools/importers/fileset_generic.py
@@ -1,4 +1,7 @@
+from typing import Any, Dict, List, Optional
+
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FilesetEntity
from fatcat_tools import entity_from_dict
@@ -17,7 +20,7 @@ class FilesetImporter(EntityImporter):
Currently only creates (insert), no updates.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import"
eg_extra = kwargs.pop("editgroup_extra", dict())
@@ -29,7 +32,7 @@ class FilesetImporter(EntityImporter):
# bezerk mode doesn't make sense for this importer
assert self.bezerk_mode is False
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
if not row.get("release_ids"):
self.counts["skip-no-release-ids"] += 1
return False
@@ -47,7 +50,7 @@ class FilesetImporter(EntityImporter):
return False
return True
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> Optional[FilesetEntity]:
fse = entity_from_dict(
row,
@@ -57,7 +60,7 @@ class FilesetImporter(EntityImporter):
fse = self.generic_fileset_cleanups(fse)
return fse
- def try_update(self, fse):
+ def try_update(self, fse: FilesetEntity) -> bool:
if not self.skip_release_fileset_check:
for release_id in fse.release_ids:
@@ -74,7 +77,7 @@ class FilesetImporter(EntityImporter):
# do the insert
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FilesetEntity]) -> None:
self.api.create_fileset_auto_batch(
fatcat_openapi_client.FilesetAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index f7bb5357..830c9bbb 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -2,8 +2,10 @@
import base64
import json
+from typing import Any, Dict, List, Optional
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
from .common import EntityImporter, clean, make_rel_url
@@ -22,7 +24,7 @@ class GrobidMetadataImporter(EntityImporter):
TODO: relaxing 'None' constraint on parse_record() might make this refactor-able.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -34,10 +36,10 @@ class GrobidMetadataImporter(EntityImporter):
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.longtail_oa = kwargs.get("longtail_oa", False)
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
return True
- def parse_record(self, row):
+ def parse_record(self, row: str) -> Optional[FileEntity]:
fields = row.split("\t")
sha1_key = fields[0]
@@ -72,12 +74,12 @@ class GrobidMetadataImporter(EntityImporter):
fe.release_ids.append(release_edit.ident)
return fe
- def parse_grobid_json(self, obj):
+ def parse_grobid_json(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
if not obj.get("title"):
return None
- extra_grobid = dict()
+ extra_grobid: Dict[str, Any] = dict()
abstract = obj.get("abstract")
if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
@@ -103,7 +105,7 @@ class GrobidMetadataImporter(EntityImporter):
refs = []
for raw in obj.get("citations", []):
- cite_extra = dict()
+ cite_extra: Dict[str, Any] = dict()
year = None
if raw.get("date"):
try:
@@ -162,13 +164,15 @@ class GrobidMetadataImporter(EntityImporter):
publisher=clean(obj["journal"].get("publisher")),
volume=clean(obj["journal"].get("volume")),
issue=clean(obj["journal"].get("issue")),
- abstracts=abstracts,
+ abstracts=abstracts or None,
ext_ids=fatcat_openapi_client.ReleaseExtIds(),
- extra=extra,
+ extra=extra or None,
)
return re
- def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
+ def parse_file_metadata(
+ self, sha1_key: str, cdx: Dict[str, Any], mimetype: str, file_size: int
+ ) -> FileEntity:
sha1 = (
base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
@@ -197,11 +201,11 @@ class GrobidMetadataImporter(EntityImporter):
return fe
- def try_update(self, entity):
+ def try_update(self, re: FileEntity) -> bool:
# did the exists check in 'parse_record()', because we needed to create a release
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FileEntity]) -> None:
self.api.create_file_auto_batch(
fatcat_openapi_client.FileAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
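
The grobid_metadata importer's parse_file_metadata() builds a FileEntity from a base32-encoded "sha1:..." key by converting it to hex. A standalone sketch of that conversion (the key value is made up, and the .lower().decode() tail is an assumed completion of the truncated hunk above):

    import base64

    sha1_key = "sha1:TTGMTAR7P5SVVMLHJXAUT6EZ5A46RWBR"  # illustrative key
    sha1_hex = (
        base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
        .lower()
        .decode("utf-8")
    )
    print(sha1_hex)  # 40-character hex digest
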
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index e0a6c3f5..e13ce4bd 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -1,12 +1,23 @@
import datetime
+from typing import Any, Dict, List, Optional
import fatcat_openapi_client
+from fatcat_openapi_client import (
+ ApiClient,
+ FileEntity,
+ FilesetEntity,
+ FilesetUrl,
+ FileUrl,
+ WebcaptureEntity,
+)
from .common import EntityImporter, make_rel_url
class IngestFileResultImporter(EntityImporter):
- def __init__(self, api, require_grobid=True, **kwargs):
+ def __init__(
+ self, api: fatcat_openapi_client.ApiClient, require_grobid: bool = True, **kwargs
+ ) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -41,7 +52,7 @@ class IngestFileResultImporter(EntityImporter):
if kwargs.get("skip_source_allowlist", False):
self.ingest_request_source_allowlist = []
- def want_file(self, row) -> bool:
+ def want_file(self, row: Dict[str, Any]) -> bool:
"""
File-specific part of want(). Generic across general ingest and save-paper-now.
"""
@@ -76,7 +87,7 @@ class IngestFileResultImporter(EntityImporter):
return True
- def want_ingest(self, row) -> bool:
+ def want_ingest(self, row: Dict[str, Any]) -> bool:
"""
Sandcrawler ingest-specific part of want(). Generic across file and
webcapture ingest.
@@ -115,7 +126,7 @@ class IngestFileResultImporter(EntityImporter):
return True
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
"""
Overall logic here probably needs work (TODO):
@@ -137,7 +148,7 @@ class IngestFileResultImporter(EntityImporter):
return True
- def parse_ingest_release_ident(self, row):
+ def parse_ingest_release_ident(self, row: Dict[str, Any]) -> Optional[str]:
request = row["request"]
fatcat = request.get("fatcat")
@@ -178,7 +189,7 @@ class IngestFileResultImporter(EntityImporter):
return release_ident
- def parse_terminal(self, row):
+ def parse_terminal(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
terminal = row.get("terminal")
if not terminal:
# support old cdx-only ingest results
@@ -206,7 +217,7 @@ class IngestFileResultImporter(EntityImporter):
)
return terminal
- def parse_urls(self, row, terminal):
+ def parse_urls(self, row: Dict[str, Any], terminal: Dict[str, Any]) -> List[FileUrl]:
request = row["request"]
@@ -224,10 +235,10 @@ class IngestFileResultImporter(EntityImporter):
)
urls = [url, ("webarchive", wayback)]
- urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+ urls = [FileUrl(rel=rel, url=url) for (rel, url) in urls]
return urls
- def parse_edit_extra(self, row):
+ def parse_edit_extra(self, row: Dict[str, Any]) -> Dict[str, Any]:
request = row["request"]
edit_extra = dict()
@@ -251,7 +262,7 @@ class IngestFileResultImporter(EntityImporter):
return edit_extra
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> FileEntity:
request = row["request"]
file_meta = row["file_meta"]
@@ -283,7 +294,7 @@ class IngestFileResultImporter(EntityImporter):
urls = self.parse_urls(row, terminal)
- fe = fatcat_openapi_client.FileEntity(
+ fe = FileEntity(
md5=file_meta["md5hex"],
sha1=file_meta["sha1hex"],
sha256=file_meta["sha256hex"],
@@ -298,7 +309,7 @@ class IngestFileResultImporter(EntityImporter):
fe.edit_extra = edit_extra
return fe
- def try_update(self, fe):
+ def try_update(self, fe: FileEntity) -> bool:
# lookup sha1, or create new entity
existing = None
try:
@@ -330,7 +341,7 @@ class IngestFileResultImporter(EntityImporter):
self.counts["skip-update-disabled"] += 1
return False
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FileEntity]) -> None:
if self.submit_mode:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
@@ -358,7 +369,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
them for further human review (as opposed to accepting by default).
"""
- def __init__(self, api, submit_mode=True, **kwargs):
+ def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -371,7 +382,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
source = row["request"].get("ingest_request_source")
if not source:
@@ -397,7 +408,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
into webcapture objects.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -408,7 +419,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
if not self.want_ingest(row):
return False
@@ -426,7 +437,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
return True
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> Optional[WebcaptureEntity]:
request = row["request"]
file_meta = row["file_meta"]
@@ -512,7 +523,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
wc.edit_extra = edit_extra
return wc
- def try_update(self, wc):
+ def try_update(self, wc: WebcaptureEntity) -> bool:
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
@@ -539,7 +550,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
# so go ahead and insert!
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[WebcaptureEntity]) -> None:
if self.submit_mode:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
@@ -565,7 +576,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
Like SavePaperNowFileImporter, but for webcapture (HTML) ingest.
"""
- def __init__(self, api, submit_mode=True, **kwargs):
+ def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -577,7 +588,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
"""
Relatively custom want() here, a synthesis of other filters.
@@ -617,7 +628,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
results into fileset objects.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -629,7 +640,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.max_file_count = 300
- def want_fileset(self, row):
+ def want_fileset(self, row: Dict[str, Any]) -> bool:
if not row.get("manifest") or len(row.get("manifest")) == 0:
self.counts["skip-empty-manifest"] += 1
@@ -645,7 +656,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
if not self.want_ingest(row):
return False
@@ -662,7 +673,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
- def parse_fileset_urls(self, row):
+ def parse_fileset_urls(self, row: Dict[str, Any]) -> List[FilesetUrl]:
if not row.get("strategy"):
return []
strategy = row["strategy"]
@@ -717,7 +728,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
)
return urls
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> FilesetEntity:
request = row["request"]
@@ -735,7 +746,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
self.counts["skip-release-not-found"] += 1
return None
- entity_extra = dict()
+ entity_extra: Dict[str, Any] = dict()
edit_extra = self.parse_edit_extra(row)
edit_extra["ingest_strategy"] = row["ingest_strategy"]
if row.get("platform"):
@@ -789,12 +800,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
fe.edit_extra = edit_extra
return fe
- def try_update(self, wc):
+ def try_update(self, fse: FilesetEntity) -> bool:
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
# XXX: how to duplicate check?
- if other.original_url == wc.original_url:
+ if other.original_url == fse.original_url:
self.counts["skip-in-queue"] += 1
return False
@@ -802,12 +813,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
# existing = None
# NOTE: in lieu of existing checks (by lookup), only allow one fileset per release
- release = self.api.get_release(wc.release_ids[0], expand="filesets")
+ release = self.api.get_release(fse.release_ids[0], expand="filesets")
if release.filesets:
# XXX: how to duplicate check filesets?
# check if this is an existing match, or just a similar hit
for other in release.filesets:
- if wc.original_url == other.original_url:
+ if fse.original_url == other.original_url:
# TODO: compare very similar timestamps of same time (different formats)
self.counts["exists"] += 1
return False
@@ -816,7 +827,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FilesetEntity]) -> None:
if self.submit_mode:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
@@ -842,7 +853,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
Like SavePaperNowFileImporter, but for fileset/dataset ingest.
"""
- def __init__(self, api, submit_mode=True, **kwargs):
+ def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -854,7 +865,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
source = row["request"].get("ingest_request_source")
if not source:
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index a7e06e6a..f540c264 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,16 +1,19 @@
import datetime
import sqlite3
import sys
+from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from bs4 import BeautifulSoup
+from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
from fatcat_tools.normal import clean_doi
from .common import DATE_FMT, EntityImporter, clean, is_cjk
-def parse_jalc_persons(raw_persons):
+# TODO: should be List[Tag] not List[Any] for full type annotations
+def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
"""
For the most part, JALC DC names are in either Japanese or English. The
two common patterns are a list alternating between the two (in which case
@@ -47,7 +50,7 @@ def parse_jalc_persons(raw_persons):
if lang == "en" and surname and given_name:
# english names order is flipped
name = "{} {}".format(given_name, surname)
- rc = fatcat_openapi_client.ReleaseContrib(
+ rc = ReleaseContrib(
raw_name=name, surname=surname, given_name=given_name, role="author"
)
# add an extra hint field; won't end up in serialized object
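A standalone sketch of the flipped English name ordering shown in this hunk, using made-up name values; only the ReleaseContrib constructor from fatcat_openapi_client is assumed.

from fatcat_openapi_client import ReleaseContrib

# hypothetical values standing in for what parse_jalc_persons extracts from the XML
surname, given_name, lang = "Yamada", "Taro", "en"
if lang == "en" and surname and given_name:
    # English names are given-name first, so the raw name is flipped
    rc = ReleaseContrib(
        raw_name="{} {}".format(given_name, surname),  # "Taro Yamada"
        surname=surname,
        given_name=given_name,
        role="author",
    )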
@@ -100,7 +103,7 @@ class JalcImporter(EntityImporter):
NOTE: some JALC DOIs seem to get cross-registered with Crossref
"""
- def __init__(self, api, issn_map_file, **kwargs):
+ def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata")
eg_extra = kwargs.get("editgroup_extra", dict())
@@ -125,7 +128,7 @@ class JalcImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi):
+ def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
if self.extid_map_db is None:
return dict(
core_id=None,
@@ -158,10 +161,12 @@ class JalcImporter(EntityImporter):
jstor_id=None,
)
- def want(self, obj):
+ def want(self, raw_record: Any) -> bool:
return True
- def parse_record(self, record):
+ # TODO: mypy annotations partially skipped on this function ('Any' instead of 'Tag')
+    # for now because XML parsing annotations are large and complex
+ def parse_record(self, record: Any) -> Optional[ReleaseEntity]:
"""
record is a beautiful soup object
returns a ReleaseEntity, or None
@@ -170,8 +175,8 @@ class JalcImporter(EntityImporter):
fields.
"""
- extra = dict()
- extra_jalc = dict()
+ extra: Dict[str, Any] = dict()
+ extra_jalc: Dict[str, Any] = dict()
titles = record.find_all("title")
if not titles:
@@ -254,7 +259,7 @@ class JalcImporter(EntityImporter):
publisher = None
container_name = None
- container_extra = dict()
+ container_extra: Dict[str, Any] = dict()
if record.publicationName:
pubs = [
@@ -335,7 +340,7 @@ class JalcImporter(EntityImporter):
if not title:
return None
- re = fatcat_openapi_client.ReleaseEntity(
+ re = ReleaseEntity(
work_id=None,
title=title,
original_title=clean(original_title),
@@ -364,7 +369,7 @@ class JalcImporter(EntityImporter):
)
return re
- def try_update(self, re):
+ def try_update(self, re: ReleaseEntity) -> bool:
# lookup existing DOI
existing = None
@@ -384,7 +389,7 @@ class JalcImporter(EntityImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ReleaseEntity]) -> None:
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
@@ -394,7 +399,7 @@ class JalcImporter(EntityImporter):
)
)
- def parse_file(self, handle):
+ def parse_file(self, handle: Any) -> None:
"""
Helper for testing; can run this file stand-alone instead of using a pusher
"""
@@ -408,4 +413,3 @@ class JalcImporter(EntityImporter):
# print(json.dumps(resp))
print(resp)
# sys.exit(-1)
-
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index 6d1fefa3..a45e49f3 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -1,9 +1,12 @@
+from typing import Any, Dict, List, Optional
+
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, ContainerEntity
from .common import EntityImporter, clean
-def or_none(s):
+def or_none(s: Optional[str]) -> Optional[str]:
if s is None:
return None
if len(s) == 0:
@@ -11,7 +14,7 @@ def or_none(s):
return s
-def truthy(s):
+def truthy(s: Optional[str]) -> Optional[bool]:
if s is None:
return None
s = s.lower()
@@ -32,7 +35,7 @@ class JournalMetadataImporter(EntityImporter):
See guide for details on the many 'extra' fields used here.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -42,12 +45,12 @@ class JournalMetadataImporter(EntityImporter):
eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter")
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
if raw_record.get("issnl") and raw_record.get("name"):
return True
return False
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
"""
row is a python dict (parsed from JSON).
@@ -106,7 +109,7 @@ class JournalMetadataImporter(EntityImporter):
if not name:
return None
- ce = fatcat_openapi_client.ContainerEntity(
+ ce = ContainerEntity(
issnl=row["issnl"],
issne=row.get("issne"),
issnp=row.get("issnp"),
@@ -118,7 +121,7 @@ class JournalMetadataImporter(EntityImporter):
)
return ce
- def try_update(self, ce):
+ def try_update(self, ce: ContainerEntity) -> bool:
existing = None
try:
@@ -148,7 +151,7 @@ class JournalMetadataImporter(EntityImporter):
# if we got this far, it's a bug
raise NotImplementedError
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ContainerEntity]) -> None:
self.api.create_container_auto_batch(
fatcat_openapi_client.ContainerAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 287fb308..0a6eec65 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -2,9 +2,11 @@ import datetime
import json
import sys
import warnings
+from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from bs4 import BeautifulSoup
+from fatcat_openapi_client import ApiClient, ReleaseEntity
from .common import LANG_MAP_MARC, EntityImporter, clean
from .crossref import CONTAINER_TYPE_MAP
@@ -32,7 +34,7 @@ class JstorImporter(EntityImporter):
Collection)
"""
- def __init__(self, api, issn_map_file, **kwargs):
+ def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata")
eg_extra = kwargs.get("editgroup_extra", dict())
@@ -49,19 +51,22 @@ class JstorImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
- def map_container_type(self, crossref_type):
+ def map_container_type(self, crossref_type: Optional[str]) -> Optional[str]:
return CONTAINER_TYPE_MAP.get(crossref_type)
- def want(self, obj):
+ def want(self, raw_record: Any) -> bool:
return True
- def parse_record(self, article):
+ # TODO: mypy annotations partially skipped on this function ('Any' instead of
+ # 'BeautifulSoup') for now because XML parsing annotations are large and
+ # complex
+ def parse_record(self, article: Any) -> Optional[ReleaseEntity]:
journal_meta = article.front.find("journal-meta")
article_meta = article.front.find("article-meta")
- extra = dict()
- extra_jstor = dict()
+ extra: Dict[str, Any] = dict()
+ extra_jstor: Dict[str, Any] = dict()
release_type = JSTOR_TYPE_MAP.get(article["article-type"])
title = article_meta.find("article-title")
@@ -269,7 +274,7 @@ class JstorImporter(EntityImporter):
)
return re
- def try_update(self, re):
+ def try_update(self, re: ReleaseEntity) -> bool:
# first, lookup existing by JSTOR id (which must be defined)
existing = None
@@ -313,7 +318,7 @@ class JstorImporter(EntityImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ReleaseEntity]) -> None:
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
@@ -323,7 +328,7 @@ class JstorImporter(EntityImporter):
)
)
- def parse_file(self, handle):
+ def parse_file(self, handle: Any) -> None:
# 1. open with beautiful soup
soup = BeautifulSoup(handle, "xml")
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 7c2a6a87..9c80dd72 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -1,4 +1,7 @@
+from typing import Any, Dict, List, Optional
+
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
from fatcat_tools.normal import clean_doi
@@ -29,7 +32,7 @@ class MatchedImporter(EntityImporter):
- core_id, wikidata_id, pmcid, pmid: not as lists
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -41,10 +44,10 @@ class MatchedImporter(EntityImporter):
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.default_mimetype = kwargs.get("default_mimetype", None)
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
return True
- def parse_record(self, obj):
+ def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]:
dois = [d.lower() for d in obj.get("dois", [])]
# lookup dois
@@ -129,7 +132,7 @@ class MatchedImporter(EntityImporter):
if urls[0].url.endswith(".pdf"):
mimetype = "application/pdf"
- fe = fatcat_openapi_client.FileEntity(
+ fe = FileEntity(
md5=obj.get("md5"),
sha1=obj["sha1"],
sha256=obj.get("sha256"),
@@ -140,7 +143,7 @@ class MatchedImporter(EntityImporter):
)
return fe
- def try_update(self, fe):
+ def try_update(self, fe: FileEntity) -> bool:
# lookup sha1, or create new entity
existing = None
try:
@@ -207,7 +210,7 @@ class MatchedImporter(EntityImporter):
self.counts["update"] += 1
return False
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FileEntity]) -> None:
self.api.create_file_auto_batch(
fatcat_openapi_client.FileAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index b514e6e5..430cdd0f 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -1,11 +1,13 @@
import sys
+from typing import Any, Dict, List, Optional
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, CreatorEntity
from .common import EntityImporter, clean
-def value_or_none(e):
+def value_or_none(e: Any) -> Any:
if type(e) == dict:
e = e.get("value")
if type(e) == str and len(e) == 0:
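A quick illustration of value_or_none() as shown above; the results for the truncated empty-string branch and the plain passthrough case are assumptions, not taken from the patch.

# assumes the value_or_none() definition above
print(value_or_none({"value": "0000-0002-1825-0097"}))  # "0000-0002-1825-0097"
print(value_or_none({"value": ""}))                     # None (assumed)
print(value_or_none("Public"))                          # "Public" (assumed passthrough)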
@@ -22,7 +24,7 @@ def value_or_none(e):
class OrcidImporter(EntityImporter):
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = kwargs.get(
"editgroup_description",
@@ -32,10 +34,10 @@ class OrcidImporter(EntityImporter):
eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter")
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
return True
- def parse_record(self, obj):
+ def parse_record(self, obj: Dict[str, Any]) -> Optional[CreatorEntity]:
"""
obj is a python dict (parsed from json).
returns a CreatorEntity
@@ -67,7 +69,7 @@ class OrcidImporter(EntityImporter):
if not display:
# must have *some* name
return None
- ce = fatcat_openapi_client.CreatorEntity(
+ ce = CreatorEntity(
orcid=orcid,
given_name=clean(given),
surname=clean(sur),
@@ -76,10 +78,10 @@ class OrcidImporter(EntityImporter):
)
return ce
- def try_update(self, raw_record):
+ def try_update(self, ce: CreatorEntity) -> bool:
existing = None
try:
- existing = self.api.lookup_creator(orcid=raw_record.orcid)
+ existing = self.api.lookup_creator(orcid=ce.orcid)
except fatcat_openapi_client.rest.ApiException as err:
if err.status != 404:
raise err
@@ -92,7 +94,7 @@ class OrcidImporter(EntityImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[CreatorEntity]) -> None:
self.api.create_creator_auto_batch(
fatcat_openapi_client.CreatorAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 97433445..41268925 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -2,9 +2,11 @@ import datetime
import json
import sys
import warnings
+from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from bs4 import BeautifulSoup
+from fatcat_openapi_client import ApiClient, ReleaseEntity
from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid
@@ -328,7 +330,9 @@ class PubmedImporter(EntityImporter):
TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
"""
- def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs):
+ def __init__(
+ self, api: ApiClient, issn_map_file: Sequence, lookup_refs: bool = True, **kwargs
+ ):
eg_desc = kwargs.get(
"editgroup_description", "Automated import of PubMed/MEDLINE XML metadata"
@@ -347,10 +351,13 @@ class PubmedImporter(EntityImporter):
self.create_containers = kwargs.get("create_containers", True)
self.read_issn_map_file(issn_map_file)
- def want(self, obj):
+ def want(self, raw_record: BeautifulSoup) -> bool:
return True
- def parse_record(self, a):
+ # TODO: mypy annotations partially skipped on this function ('Any' instead of
+ # 'BeautifulSoup') for now because XML parsing annotations are large and
+ # complex
+ def parse_record(self, a: Any) -> ReleaseEntity:
medline = a.MedlineCitation
# PubmedData isn't required by DTD, but seems to always be present
@@ -482,8 +489,8 @@ class PubmedImporter(EntityImporter):
pub_date = journal.PubDate
if not pub_date:
pub_date = journal.JournalIssue.PubDate
- release_date = None
- release_year = None
+ release_date: Optional[str] = None
+ release_year: Optional[int] = None
if pub_date.Year:
release_year = int(pub_date.Year.string)
if pub_date.find("Day") and pub_date.find("Month"):
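This hunk cuts off inside the Day/Month branch. As a rough sketch (not the importer's exact logic), a release date can be assembled from MEDLINE-style Year/Month/Day strings, where Month may be numeric or an English abbreviation; the helper name and month map here are hypothetical.

import datetime

MONTH_ABBR = {m: i for i, m in enumerate(
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], start=1)}

def sketch_pubmed_date(year: str, month: str, day: str) -> datetime.date:
    # Month may arrive as "05" or "May"; both map to month number 5
    month_int = int(month) if month.isdigit() else MONTH_ABBR[month]
    return datetime.date(int(year), month_int, int(day))

print(sketch_pubmed_date("2019", "May", "7"))  # 2019-05-07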
@@ -578,7 +585,7 @@ class PubmedImporter(EntityImporter):
abstracts.append(abst)
other_abstracts = medline.find_all("OtherAbstract")
for other in other_abstracts:
- lang = "en"
+ lang: Optional[str] = "en"
if other.get("Language"):
lang = LANG_MAP_MARC.get(other["Language"])
abst = fatcat_openapi_client.ReleaseAbstract(
@@ -666,7 +673,7 @@ class PubmedImporter(EntityImporter):
# that there may be multiple ReferenceList (eg, sometimes one per
# Reference)
for ref in pubmed.find_all("Reference"):
- ref_extra = dict()
+ ref_extra: Dict[str, Any] = dict()
ref_doi = ref.find("ArticleId", IdType="doi")
if ref_doi:
ref_doi = clean_doi(ref_doi.string)
@@ -740,7 +747,7 @@ class PubmedImporter(EntityImporter):
)
return re
- def try_update(self, re):
+ def try_update(self, re: ReleaseEntity) -> bool:
# first, lookup existing by PMID (which must be defined)
existing = None
@@ -831,7 +838,7 @@ class PubmedImporter(EntityImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[ReleaseEntity]) -> None:
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
@@ -841,7 +848,7 @@ class PubmedImporter(EntityImporter):
)
)
- def parse_file(self, handle):
+ def parse_file(self, handle: Any) -> None:
# 1. open with beautiful soup
soup = BeautifulSoup(handle, "xml")
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
index 78eeec7a..520258cb 100644
--- a/python/fatcat_tools/importers/shadow.py
+++ b/python/fatcat_tools/importers/shadow.py
@@ -1,4 +1,7 @@
+from typing import Any, Dict, List, Optional
+
import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid
@@ -27,7 +30,7 @@ class ShadowLibraryImporter(EntityImporter):
- datetime
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -38,7 +41,7 @@ class ShadowLibraryImporter(EntityImporter):
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
- def want(self, raw_record):
+ def want(self, raw_record: Any) -> bool:
"""
Only want to import records with complete file-level metadata
"""
@@ -51,7 +54,7 @@ class ShadowLibraryImporter(EntityImporter):
return False
return True
- def parse_record(self, obj):
+ def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]:
"""
We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
"""
@@ -104,7 +107,7 @@ class ShadowLibraryImporter(EntityImporter):
urls.append(("webarchive", wayback))
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
- fe = fatcat_openapi_client.FileEntity(
+ fe = FileEntity(
md5=obj["file_meta"]["md5hex"],
sha1=obj["file_meta"]["sha1hex"],
sha256=obj["file_meta"]["sha256hex"],
@@ -116,7 +119,7 @@ class ShadowLibraryImporter(EntityImporter):
)
return fe
- def try_update(self, fe):
+ def try_update(self, fe: FileEntity) -> Optional[bool]:
# lookup sha1, or create new entity
existing = None
try:
@@ -189,7 +192,7 @@ class ShadowLibraryImporter(EntityImporter):
self.counts["update"] += 1
return False
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FileEntity]) -> None:
self.api.create_file_auto_batch(
fatcat_openapi_client.FileAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 22fefad3..f9ee29c9 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -12,12 +12,14 @@ import hashlib
import json
import subprocess
import sys
+from typing import Any, Dict, List, Optional, Tuple
import requests
from bs4 import BeautifulSoup
from fatcat_openapi_client import (
ApiClient,
Editgroup,
+ EntityEdit,
WebcaptureCdxLine,
WebcaptureEntity,
WebcaptureUrl,
@@ -30,7 +32,7 @@ GWB_URL_BASE = "https://web.archive.org/web"
REQ_SESSION = requests.Session()
-def parse_wbm_url(url):
+def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
"""Takes a wayback machine URL, and returns a tuple:
(timestamp, datetime, original_url)
@@ -42,7 +44,7 @@ def parse_wbm_url(url):
return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-def test_parse_wbm_url():
+def test_parse_wbm_url() -> None:
u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
assert parse_wbm_url(u) == (
"20010712114837",
@@ -51,7 +53,7 @@ def test_parse_wbm_url():
)
-def parse_wbm_timestamp(timestamp):
+def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
"""
Takes a complete WBM timestamp string (like "20020327115625") and returns a
python datetime object (UTC)
@@ -71,18 +73,20 @@ def parse_wbm_timestamp(timestamp):
)
-def test_parse_wbm_timestamp():
+def test_parse_wbm_timestamp() -> None:
assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-def fetch_wbm(url):
+def fetch_wbm(url: str) -> bytes:
resp = REQ_SESSION.get(url)
resp.raise_for_status()
assert resp.content
return resp.content
-def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
+def lookup_cdx(
+ embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
+) -> Optional[WebcaptureCdxLine]:
sys.stderr.write(embed_url + "\n")
assert embed_url.startswith("/web/")
embed_url = embed_url.split("/")
@@ -132,7 +136,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
return None
-def wayback_url_to_relative(url):
+def wayback_url_to_relative(url: str) -> Optional[str]:
"""
Wayback URLs can be relative or absolute in rewritten documents. This
function converts any form of rewritten URL to a relative (to
@@ -149,7 +153,7 @@ def wayback_url_to_relative(url):
return None
-def extract_embeds(soup):
+def extract_embeds(soup: BeautifulSoup) -> List[str]:
embeds = set()
@@ -175,7 +179,7 @@ def extract_embeds(soup):
return list(embeds)
-def static_wayback_webcapture(wayback_url, cdx_output=None):
+def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
"""
Given a complete wayback machine capture URL, like:
@@ -214,7 +218,9 @@ def static_wayback_webcapture(wayback_url, cdx_output=None):
return wc
-def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
+def auto_wayback_static(
+ api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
+) -> Tuple[Optional[str], Optional[EntityEdit]]:
"""
Returns a tuple: (editgroup_id, edit). If failed, both are None
"""
@@ -250,7 +256,7 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
return (editgroup_id, edit)
-def main():
+def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="verbose output")
parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")