author     Bryan Newbold <bnewbold@robocracy.org>  2020-11-06 18:40:33 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2020-11-06 18:40:33 -0800
commit     f32ff2bd5ab1dba1dc3108b75b28ce4090d9c00f (patch)
tree       a3aed7e881c437b1cdfd21598422706dd255ab97
parent     013ee4d4ea51ce2c348ed051777fb2d0c18fe903 (diff)
html ingest: remaining implementation
-rw-r--r--  python/fatcat_tools/importers/ingest.py  41
1 file changed, 19 insertions(+), 22 deletions(-)
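
Before the diff itself, a minimal sketch (not part of the commit) of the duplicate check that the try_update() hunk below introduces: the webcapture's first linked release is fetched with webcaptures expanded, and an existing capture of the same original URL counts as a match. The standalone function and its name are illustrative; in the commit the logic lives inline in IngestWebResultImporter.try_update(), which bumps counters ('exists', 'skip-release-has-webcapture', 'skip-update-disabled') rather than returning a bare boolean.

    # `api` is assumed to be a configured fatcat_openapi_client API instance
    def webcapture_exists_for_release(api, wc) -> bool:
        # only the first linked release is considered (mirrors the TODO in the diff)
        release = api.get_release(wc.release_ids[0], expand="webcaptures")
        for other in (release.webcaptures or []):
            if wc.original_url == other.original_url:
                # a capture of this URL already exists for the release
                return True
        return False
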
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 2965f229..4dcb1ec3 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -221,7 +221,6 @@ class IngestFileResultImporter(EntityImporter):
     def parse_record(self, row):
 
         request = row['request']
-        fatcat = request.get('fatcat')
         file_meta = row['file_meta']
 
         # double check that want() filtered request correctly (eg, old requests)
@@ -399,7 +398,6 @@ class IngestWebResultImporter(IngestFileResultImporter):
 
         return True
 
-
     def parse_record(self, row):
 
         request = row['request']
@@ -442,9 +440,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
         wc_cdx = []
         # primary resource first
         wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
-            # XXX
-            #surt=terminal['terminal_surt'], # XXX: from CDX?
-            surt=terminal['terminal_url'],
+            surt=terminal_cdx['surt'],
             timestamp=terminal['terminal_timestamp'],
             url=terminal['terminal_url'],
             mimetype=file_meta['mimetype'],
@@ -463,7 +459,7 @@ class IngestWebResultImporter(IngestFileResultImporter):
                 timestamp=timestamp,
                 url=resource['url'],
                 mimetype=resource.get('mimetype'),
-                size=resource.get('size_bytes'),
+                size=resource.get('size'),
                 sha1=resource.get('sha1hex'),
                 sha256=resource.get('sha256hex'),
             ))
@@ -482,7 +478,6 @@ class IngestWebResultImporter(IngestFileResultImporter):
             wc.edit_extra = edit_extra
         return wc
 
-
     def try_update(self, wc):
 
         # check for existing edits-in-progress with same file hash
@@ -491,23 +486,25 @@ class IngestWebResultImporter(IngestFileResultImporter):
                 self.counts['skip-in-queue'] += 1
                 return False
 
-        # lookup sha1, or create new entity
-        existing = None
-        # XXX: lookup *release* instead; skip if any existing web capture entities
-        # XXX: only one release per webcapture
-        try:
-            existing = self.api.lookup_file(sha1=wc.sha1)
-        except fatcat_openapi_client.rest.ApiException as err:
-            if err.status != 404:
-                raise err
-
-        if not existing:
-            return True
-        else:
-            # TODO: for now, never update
-            self.counts['skip-update-disabled'] += 1
+        # lookup sha1, or create new entity (TODO: API doesn't support this yet)
+        #existing = None
+
+        # TODO: currently only allow one release per webcapture
+        release = self.api.get_release(wc.release_ids[0], expand="webcaptures")
+        if release.webcaptures:
+            # check if this is an existing match, or just a similar hit
+            for other in release.webcaptures:
+                if wc.original_url == other.original_url:
+                    # TODO: compare very similar timestamps of same time (different formats)
+                    self.counts['exists'] += 1
+                    return False
+            self.counts['skip-release-has-webcapture'] += 1
             return False
+        # TODO: for now, never update
+        self.counts['skip-update-disabled'] += 1
+        return False
+
 
     def insert_batch(self, batch):
         self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch(
             editgroup=fatcat_openapi_client.Editgroup(
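
The last hunk is cut off mid-call above. For orientation only, a plausible shape of the complete insert_batch(), assuming WebcaptureAutoBatch follows the same editgroup-plus-entity_list pattern as the other auto-batch models these importers use; the editgroup description/extra attributes are an assumption and are not shown in the diff:

    def insert_batch(self, batch):
        # assumed to mirror the FileAutoBatch pattern used elsewhere in fatcat_tools
        self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch(
            editgroup=fatcat_openapi_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))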