diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-10 15:33:46 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-10 15:33:46 -0800 |
commit | 357b1b07d071cce7d4cd2289ee3965018c89646c (patch) | |
tree | 600c80ace5302bea1de80e15b8b76d7e48edda51 /python | |
parent | 5ce01ab8ca992023c320294f20356735d550d4d9 (diff) | |
download | sandcrawler-357b1b07d071cce7d4cd2289ee3965018c89646c.tar.gz sandcrawler-357b1b07d071cce7d4cd2289ee3965018c89646c.zip |
ingest: start re-processing GROBID with newer version
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 8 |
1 files changed, 6 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 4a5abbe..b6a5115 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -361,8 +361,12 @@ class IngestFileWorker(SandcrawlerWorker):
         if self.try_existing_grobid:
             existing = self.pgrest_client.get_grobid(file_meta["sha1hex"])
             if existing:
-                print("found existing GROBID result", file=sys.stderr)
-                return existing
+                # grobid_timestamp = existing.get("grobid_timestamp") or None
+                # status
+                grobid_version = existing.get("grobid_version") or None
+                if grobid_version and grobid_version.startswith("0.7"):
+                    print("found existing GROBID result", file=sys.stderr)
+                    return existing
 
         # Need to actually processes
         result = self.grobid_client.process_fulltext(resource.body)
```