aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest_file.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-15 17:13:39 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-15 18:15:29 -0700
commit13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7 (patch)
tree853a3dc60dcf3bd635be0816ff59b23f0975ae7d /python/sandcrawler/ingest_file.py
parenta09396caefe709b521e560add5b01c1a5c94cb53 (diff)
downloadsandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.tar.gz
sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.zip
improve fileset ingest integration with file ingest
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- python/sandcrawler/ingest_file.py | 12
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index afaa329..72d4e14 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -43,7 +43,7 @@ class IngestFileWorker(SandcrawlerWorker):
check_existing_ingest(base_url) -> ingest_file_result or none
process_existing(result) -> response
- try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit()
+ try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_file_hit()
Fetch resource:
@@ -51,7 +51,7 @@ class IngestFileWorker(SandcrawlerWorker):
Process resource:
- process_hit(ResourceResult) -> response
+ process_file_hit(ResourceResult) -> response
process_grobid(ResourceResult)
"""
@@ -281,10 +281,12 @@ class IngestFileWorker(SandcrawlerWorker):
}
return result
- def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
+ def process_file_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
"""
Run all the necessary processing for a new/fresh ingest hit.
"""
+ if ingest_type in ["dataset-file", "component"] and file_meta['mimetype'] == "application/pdf":
+ ingest_type = "pdf"
if ingest_type == "pdf":
return {
'grobid': self.process_grobid(resource, file_meta),
@@ -304,6 +306,8 @@ class IngestFileWorker(SandcrawlerWorker):
return {}
elif ingest_type == "component":
return {}
+ elif ingest_type == "dataset-file":
+ return {}
else:
raise NotImplementedError(f"process {ingest_type} hit")
@@ -770,7 +774,7 @@ class IngestFileWorker(SandcrawlerWorker):
else:
raise NotImplementedError()
- info = self.process_hit(ingest_type, resource, file_meta)
+ info = self.process_file_hit(ingest_type, resource, file_meta)
result.update(info)
# check if processing turned up an error