diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:03:31 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:03:31 -0800 |
commit | c2eb4dad14c5b1d3566f39519065eb20eb7fd57f (patch) | |
tree | 3da3b7e5e1d5859c8b2d23c4f8640ce91672bdde /python | |
parent | cc536eaf01c3a58df292b5917d2f11b9cd8a3cf3 (diff) | |
download | sandcrawler-c2eb4dad14c5b1d3566f39519065eb20eb7fd57f.tar.gz sandcrawler-c2eb4dad14c5b1d3566f39519065eb20eb7fd57f.zip |
ingest: sketch out more of how 'existing' path would work
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest.py | 30 |
1 file changed, 22 insertions, 8 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index bb5f3fc..e5eb6e8 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -76,10 +76,12 @@ class IngestFileWorker(SandcrawlerWorker): """ if not self.try_existing_ingest: return None - raise NotImplementedError("can't pre-check ingests yet") - - # this "return True" is just here to make pylint happy - return True + existing = self.pgrest_client.get_ingest_file_result(base_url) + # TODO: filter on more flags? + if existing and existing['hit'] == True: + return existing + else: + return None def find_resource(self, url, best_mimetype=None): """ @@ -120,13 +122,25 @@ class IngestFileWorker(SandcrawlerWorker): If we have an existing ingest file result, do any database fetches or additional processing necessary to return a result. """ + raise NotImplementedError("process_existing() not tested or safe yet") + assert result_row['hit'] + existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) + existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) + if not (existing_file_meta and existing_grobid): + raise NotImplementedError("partially-exsiting records not implemented yet") + # TODO: CDX result = { - 'hit': result_row.hit, - 'status': result_row.status, + 'hit': result_row['hit'], + 'status': "existing", 'request': request, + 'grobid': existing_grobid, + 'file_meta': existing_file_meta, + 'terminal': { + 'terminal_url': result_row['terminal_url'], + 'terminal_dt': result_row['terminal_dt'], + 'terminal_status_code': result_row['terminal_status_code'], + }, } - # TODO: fetch file_meta - # TODO: fetch grobid return result def process_hit(self, resource, file_meta): |