aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-14 17:03:31 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-14 17:03:31 -0800
commitc2eb4dad14c5b1d3566f39519065eb20eb7fd57f (patch)
tree3da3b7e5e1d5859c8b2d23c4f8640ce91672bdde
parentcc536eaf01c3a58df292b5917d2f11b9cd8a3cf3 (diff)
downloadsandcrawler-c2eb4dad14c5b1d3566f39519065eb20eb7fd57f.tar.gz
sandcrawler-c2eb4dad14c5b1d3566f39519065eb20eb7fd57f.zip
ingest: sketch out more of how 'existing' path would work
-rw-r--r--python/sandcrawler/ingest.py30
1 files changed, 22 insertions, 8 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index bb5f3fc..e5eb6e8 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -76,10 +76,12 @@ class IngestFileWorker(SandcrawlerWorker):
"""
if not self.try_existing_ingest:
return None
- raise NotImplementedError("can't pre-check ingests yet")
-
- # this "return True" is just here to make pylint happy
- return True
+ existing = self.pgrest_client.get_ingest_file_result(base_url)
+ # TODO: filter on more flags?
+ if existing and existing['hit'] == True:
+ return existing
+ else:
+ return None
def find_resource(self, url, best_mimetype=None):
"""
@@ -120,13 +122,25 @@ class IngestFileWorker(SandcrawlerWorker):
If we have an existing ingest file result, do any database fetches or
additional processing necessary to return a result.
"""
+ raise NotImplementedError("process_existing() not tested or safe yet")
+ assert result_row['hit']
+ existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
+ existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
+ if not (existing_file_meta and existing_grobid):
+            raise NotImplementedError("partially-existing records not implemented yet")
+ # TODO: CDX
result = {
- 'hit': result_row.hit,
- 'status': result_row.status,
+ 'hit': result_row['hit'],
+ 'status': "existing",
'request': request,
+ 'grobid': existing_grobid,
+ 'file_meta': existing_file_meta,
+ 'terminal': {
+ 'terminal_url': result_row['terminal_url'],
+ 'terminal_dt': result_row['terminal_dt'],
+ 'terminal_status_code': result_row['terminal_status_code'],
+ },
}
- # TODO: fetch file_meta
- # TODO: fetch grobid
return result
def process_hit(self, resource, file_meta):