diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 17:13:39 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:29 -0700 |
commit | 13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7 (patch) | |
tree | 853a3dc60dcf3bd635be0816ff59b23f0975ae7d /python/sandcrawler/ingest_file.py | |
parent | a09396caefe709b521e560add5b01c1a5c94cb53 (diff) | |
download | sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.tar.gz sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.zip |
improve fileset ingest integration with file ingest
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 12 |
1 file changed, 8 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index afaa329..72d4e14 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -43,7 +43,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
             check_existing_ingest(base_url) -> ingest_file_result or none
             process_existing(result) -> response
-                try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit()
+                try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_file_hit()
 
         Fetch resource:
 
@@ -51,7 +51,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         Process resource:
 
-            process_hit(ResourceResult) -> response
+            process_file_hit(ResourceResult) -> response
             process_grobid(ResourceResult)
     """
 
@@ -281,10 +281,12 @@ class IngestFileWorker(SandcrawlerWorker):
         }
         return result
 
-    def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
+    def process_file_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
         """
         Run all the necessary processing for a new/fresh ingest hit.
         """
+        if ingest_type in ["dataset-file", "component"] and file_meta['mimetype'] == "application/pdf":
+            ingest_type = "pdf"
         if ingest_type == "pdf":
             return {
                 'grobid': self.process_grobid(resource, file_meta),
@@ -304,6 +306,8 @@ class IngestFileWorker(SandcrawlerWorker):
             return {}
         elif ingest_type == "component":
             return {}
+        elif ingest_type == "dataset-file":
+            return {}
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
 
@@ -770,7 +774,7 @@ class IngestFileWorker(SandcrawlerWorker):
         else:
             raise NotImplementedError()
 
-        info = self.process_hit(ingest_type, resource, file_meta)
+        info = self.process_file_hit(ingest_type, resource, file_meta)
         result.update(info)
 
         # check if processing turned up an error
```