aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2022-03-23 17:55:54 -0700
committerBryan Newbold <bnewbold@robocracy.org>2022-03-23 18:38:56 -0700
commit515b5ecc6e75aae834958d74883426230532f10d (patch)
tree1d5e00696bf5272e7bda7c4ff275d9e8acaaa479 /python/fatcat_tools
parentea6ccd227e0f62f5f9e7a66ba8bc90b18a2ca097 (diff)
downloadfatcat-515b5ecc6e75aae834958d74883426230532f10d.tar.gz
fatcat-515b5ecc6e75aae834958d74883426230532f10d.zip
ingest fileset fixes, and some test coverage
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/importers/common.py11
-rw-r--r--python/fatcat_tools/importers/ingest.py32
2 files changed, 30 insertions, 13 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index cd51a24c..475cb97a 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -916,3 +916,14 @@ def make_kafka_consumer(
)
print("Consuming from kafka topic {}, group {}".format(topic_name, group))
return consumer
+
+
+def filesets_very_similar(a: FilesetEntity, b: FilesetEntity) -> bool:
+    """
+    Returns True if two Fileset entities are effectively equivalent: the same set of files, identified by SHA1 hash.
+
+    Only the set() of manifest SHA1 hashes is compared; manifest order, duplicate entries, and all other entity fields are ignored.
+    """
+    a_hashes = {f.sha1 for f in a.manifest}
+    b_hashes = {f.sha1 for f in b.manifest}
+    return a_hashes == b_hashes
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 62e0e854..c8d04d6f 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -11,7 +11,7 @@ from fatcat_openapi_client import (
WebcaptureEntity,
)
-from .common import EntityImporter, make_rel_url
+from .common import EntityImporter, filesets_very_similar, make_rel_url
class IngestFileResultImporter(EntityImporter):
@@ -693,9 +693,6 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
rel="webarchive-base",
)
)
- # TODO: repository-base
- # TODO: web-base
-
if strategy == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"):
urls.append(
fatcat_openapi_client.FilesetUrl(
@@ -727,6 +724,15 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
rel="repository-base",
)
)
+ elif row.get("terminal"):
+ # fallback generic web URL
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=row["terminal"]["terminal_url"],
+ rel="web",
+ )
+ )
+
return urls
def parse_record(self, row: Dict[str, Any]) -> FilesetEntity:
@@ -781,7 +787,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
self.counts["skip-partial-file-info"] += 1
return None
if ingest_file.get("platform_url"):
- # XXX: should we include this?
+ # TODO: should we include this?
fsf.extra["original_url"] = ingest_file["platform_url"]
if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"):
fsf.extra[
@@ -805,25 +811,25 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
- # XXX: how to duplicate check?
- if other.original_url == fse.original_url:
+ if filesets_very_similar(other, fse):
self.counts["skip-in-queue"] += 1
+ self.counts["skip"] += 1
return False
# lookup sha1, or create new entity (TODO: API doesn't support this yet)
# existing = None
# NOTE: in lieu of existing checks (by lookup), only allow one fileset per release
- release = self.api.get_release(fse.release_ids[0], expand="filesets")
- if release.filesets:
- # XXX: how to duplicate check filesets?
+ if not self.bezerk_mode:
+ release = self.api.get_release(fse.release_ids[0], expand="filesets")
# check if this is an existing match, or just a similar hit
- for other in release.filesets:
- if fse.original_url == other.original_url:
- # TODO: compare very similar timestamps of same time (different formats)
+ for other in release.filesets or []:
+ if filesets_very_similar(other, fse):
self.counts["exists"] += 1
return False
+ # for now, being conservative and just skipping if release has any other fileset
self.counts["skip-release-has-fileset"] += 1
+ self.counts["skip"] += 1
return False
return True