aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/ingest.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-03 12:57:32 -0700
committerBryan Newbold <bnewbold@robocracy.org>2021-11-03 16:46:07 -0700
commitcaf1cb316ed18820f3239a285ef14bf45ef963a2 (patch)
tree2d3713773dac769878154f61c2eb9f7804f1a60c /python/fatcat_tools/importers/ingest.py
parent10a2374051568edf3d872988e730328d899a0fdd (diff)
downloadfatcat-caf1cb316ed18820f3239a285ef14bf45ef963a2.tar.gz
fatcat-caf1cb316ed18820f3239a285ef14bf45ef963a2.zip
typing: initial annotations on importers
This commit just adds the type annotations, doesn't do fixes to code to make type checking pass.
Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r--python/fatcat_tools/importers/ingest.py81
1 files changed, 46 insertions, 35 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index e0a6c3f5..e13ce4bd 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -1,12 +1,23 @@
import datetime
+from typing import Any, Dict, List, Optional
import fatcat_openapi_client
+from fatcat_openapi_client import (
+ ApiClient,
+ FileEntity,
+ FilesetEntity,
+ FilesetUrl,
+ FileUrl,
+ WebcaptureEntity,
+)
from .common import EntityImporter, make_rel_url
class IngestFileResultImporter(EntityImporter):
- def __init__(self, api, require_grobid=True, **kwargs):
+ def __init__(
+ self, api: fatcat_openapi_client.ApiClient, require_grobid: bool = True, **kwargs
+ ) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -41,7 +52,7 @@ class IngestFileResultImporter(EntityImporter):
if kwargs.get("skip_source_allowlist", False):
self.ingest_request_source_allowlist = []
- def want_file(self, row) -> bool:
+ def want_file(self, row: Dict[str, Any]) -> bool:
"""
File-specific part of want(). Generic across general ingest and save-paper-now.
"""
@@ -76,7 +87,7 @@ class IngestFileResultImporter(EntityImporter):
return True
- def want_ingest(self, row) -> bool:
+ def want_ingest(self, row: Dict[str, Any]) -> bool:
"""
Sandcrawler ingest-specific part of want(). Generic across file and
webcapture ingest.
@@ -115,7 +126,7 @@ class IngestFileResultImporter(EntityImporter):
return True
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
"""
Overall logic here probably needs work (TODO):
@@ -137,7 +148,7 @@ class IngestFileResultImporter(EntityImporter):
return True
- def parse_ingest_release_ident(self, row):
+ def parse_ingest_release_ident(self, row: Dict[str, Any]) -> Optional[str]:
request = row["request"]
fatcat = request.get("fatcat")
@@ -178,7 +189,7 @@ class IngestFileResultImporter(EntityImporter):
return release_ident
- def parse_terminal(self, row):
+ def parse_terminal(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
terminal = row.get("terminal")
if not terminal:
# support old cdx-only ingest results
@@ -206,7 +217,7 @@ class IngestFileResultImporter(EntityImporter):
)
return terminal
- def parse_urls(self, row, terminal):
+ def parse_urls(self, row: Dict[str, Any], terminal: Dict[str, Any]) -> List[FileUrl]:
request = row["request"]
@@ -224,10 +235,10 @@ class IngestFileResultImporter(EntityImporter):
)
urls = [url, ("webarchive", wayback)]
- urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+ urls = [FileUrl(rel=rel, url=url) for (rel, url) in urls]
return urls
- def parse_edit_extra(self, row):
+ def parse_edit_extra(self, row: Dict[str, Any]) -> Dict[str, Any]:
request = row["request"]
edit_extra = dict()
@@ -251,7 +262,7 @@ class IngestFileResultImporter(EntityImporter):
return edit_extra
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> FileEntity:
request = row["request"]
file_meta = row["file_meta"]
@@ -283,7 +294,7 @@ class IngestFileResultImporter(EntityImporter):
urls = self.parse_urls(row, terminal)
- fe = fatcat_openapi_client.FileEntity(
+ fe = FileEntity(
md5=file_meta["md5hex"],
sha1=file_meta["sha1hex"],
sha256=file_meta["sha256hex"],
@@ -298,7 +309,7 @@ class IngestFileResultImporter(EntityImporter):
fe.edit_extra = edit_extra
return fe
- def try_update(self, fe):
+ def try_update(self, fe: FileEntity) -> bool:
# lookup sha1, or create new entity
existing = None
try:
@@ -330,7 +341,7 @@ class IngestFileResultImporter(EntityImporter):
self.counts["skip-update-disabled"] += 1
return False
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FileEntity]) -> None:
if self.submit_mode:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
@@ -358,7 +369,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
them for further human review (as opposed to accepting by default).
"""
- def __init__(self, api, submit_mode=True, **kwargs):
+ def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -371,7 +382,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
source = row["request"].get("ingest_request_source")
if not source:
@@ -397,7 +408,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
into webcapture objects.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -408,7 +419,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
if not self.want_ingest(row):
return False
@@ -426,7 +437,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
return True
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> Optional[WebcaptureEntity]:
request = row["request"]
file_meta = row["file_meta"]
@@ -512,7 +523,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
wc.edit_extra = edit_extra
return wc
- def try_update(self, wc):
+ def try_update(self, wc: WebcaptureEntity) -> bool:
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
@@ -539,7 +550,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
# so go ahead and insert!
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[WebcaptureEntity]) -> None:
if self.submit_mode:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
@@ -565,7 +576,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
Like SavePaperNowFileImporter, but for webcapture (HTML) ingest.
"""
- def __init__(self, api, submit_mode=True, **kwargs):
+ def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -577,7 +588,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
"""
Relatively custom want() here, a synthesis of other filters.
@@ -617,7 +628,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
results into fileset objects.
"""
- def __init__(self, api, **kwargs):
+ def __init__(self, api: ApiClient, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -629,7 +640,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.max_file_count = 300
- def want_fileset(self, row):
+ def want_fileset(self, row: Dict[str, Any]) -> bool:
if not row.get("manifest") or len(row.get("manifest")) == 0:
self.counts["skip-empty-manifest"] += 1
@@ -645,7 +656,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
if not self.want_ingest(row):
return False
@@ -662,7 +673,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
- def parse_fileset_urls(self, row):
+ def parse_fileset_urls(self, row: Dict[str, Any]) -> List[FilesetUrl]:
if not row.get("strategy"):
return []
strategy = row["strategy"]
@@ -717,7 +728,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
)
return urls
- def parse_record(self, row):
+ def parse_record(self, row: Dict[str, Any]) -> FilesetEntity:
request = row["request"]
@@ -735,7 +746,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
self.counts["skip-release-not-found"] += 1
return None
- entity_extra = dict()
+ entity_extra: Dict[str, Any] = dict()
edit_extra = self.parse_edit_extra(row)
edit_extra["ingest_strategy"] = row["ingest_strategy"]
if row.get("platform"):
@@ -789,12 +800,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
fe.edit_extra = edit_extra
return fe
- def try_update(self, wc):
+ def try_update(self, fse: FilesetEntity) -> bool:
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
# XXX: how to duplicate check?
- if other.original_url == wc.original_url:
+ if other.original_url == fse.original_url:
self.counts["skip-in-queue"] += 1
return False
@@ -802,12 +813,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
# existing = None
# NOTE: in lieu of existing checks (by lookup), only allow one fileset per release
- release = self.api.get_release(wc.release_ids[0], expand="filesets")
+ release = self.api.get_release(fse.release_ids[0], expand="filesets")
if release.filesets:
# XXX: how to duplicate check filesets?
# check if this is an existing match, or just a similar hit
for other in release.filesets:
- if wc.original_url == other.original_url:
+ if fse.original_url == other.original_url:
# TODO: compare very similar timestamps of same time (different formats)
self.counts["exists"] += 1
return False
@@ -816,7 +827,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
- def insert_batch(self, batch):
+ def insert_batch(self, batch: List[FilesetEntity]) -> None:
if self.submit_mode:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
@@ -842,7 +853,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
Like SavePaperNowFileImporter, but for fileset/dataset ingest.
"""
- def __init__(self, api, submit_mode=True, **kwargs):
+ def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None:
eg_desc = (
kwargs.pop("editgroup_description", None)
@@ -854,7 +865,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
kwargs["do_updates"] = False
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
- def want(self, row):
+ def want(self, row: Dict[str, Any]) -> bool:
source = row["request"].get("ingest_request_source")
if not source: