aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-10 15:33:46 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-11-10 15:33:46 -0800
commit 357b1b07d071cce7d4cd2289ee3965018c89646c (patch)
tree 600c80ace5302bea1de80e15b8b76d7e48edda51
parent 5ce01ab8ca992023c320294f20356735d550d4d9 (diff)
download sandcrawler-357b1b07d071cce7d4cd2289ee3965018c89646c.tar.gz
sandcrawler-357b1b07d071cce7d4cd2289ee3965018c89646c.zip
ingest: start re-processing GROBID with newer version
-rw-r--r-- python/sandcrawler/ingest_file.py 8
1 files changed, 6 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 4a5abbe..b6a5115 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -361,8 +361,12 @@ class IngestFileWorker(SandcrawlerWorker):
if self.try_existing_grobid:
existing = self.pgrest_client.get_grobid(file_meta["sha1hex"])
if existing:
- print("found existing GROBID result", file=sys.stderr)
- return existing
+ # grobid_timestamp = existing.get("grobid_timestamp") or None
+ # status
+ grobid_version = existing.get("grobid_version") or None
+ if grobid_version and grobid_version.startswith("0.7"):
+ print("found existing GROBID result", file=sys.stderr)
+ return existing
# Need to actually processes
result = self.grobid_client.process_fulltext(resource.body)