diff options
-rw-r--r-- | python/example.env                | 1
-rw-r--r-- | python/sandcrawler/ingest_file.py | 12
-rw-r--r-- | python/sandcrawler/misc.py        | 15
-rw-r--r-- | python/sandcrawler/persist.py     | 2
4 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/python/example.env b/python/example.env index 5064c96..85af66c 100644 --- a/python/example.env +++ b/python/example.env @@ -5,3 +5,4 @@ IA_SECRET_KEY="dummy" CDX_AUTH_TOKEN="dummy" PETABOX_WEBDATA_SECRET="dummy" SENTRY_DSN="" +SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/" diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index afaa329..72d4e14 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -43,7 +43,7 @@ class IngestFileWorker(SandcrawlerWorker): check_existing_ingest(base_url) -> ingest_file_result or none process_existing(result) -> response - try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit() + try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_file_hit() Fetch resource: @@ -51,7 +51,7 @@ class IngestFileWorker(SandcrawlerWorker): Process resource: - process_hit(ResourceResult) -> response + process_file_hit(ResourceResult) -> response process_grobid(ResourceResult) """ @@ -281,10 +281,12 @@ class IngestFileWorker(SandcrawlerWorker): } return result - def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict: + def process_file_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict: """ Run all the necessary processing for a new/fresh ingest hit. 
""" + if ingest_type in ["dataset-file", "component"] and file_meta['mimetype'] == "application/pdf": + ingest_type = "pdf" if ingest_type == "pdf": return { 'grobid': self.process_grobid(resource, file_meta), @@ -304,6 +306,8 @@ class IngestFileWorker(SandcrawlerWorker): return {} elif ingest_type == "component": return {} + elif ingest_type == "dataset-file": + return {} else: raise NotImplementedError(f"process {ingest_type} hit") @@ -770,7 +774,7 @@ class IngestFileWorker(SandcrawlerWorker): else: raise NotImplementedError() - info = self.process_hit(ingest_type, resource, file_meta) + info = self.process_file_hit(ingest_type, resource, file_meta) result.update(info) # check if processing turned up an error diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index dc46e9a..37a2a82 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -1,4 +1,5 @@ +import os import base64 import magic import hashlib @@ -261,3 +262,17 @@ def requests_retry_session(retries=10, backoff_factor=3, session.mount('https://', adapter) return session +def sanitize_fs_path(path: str) -> str: + """ + From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540 + """ + # - pretending to chroot to the current directory + # - cancelling all redundant paths (/.. 
= /) + # - making the path relative + return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/") + +def test_sanitize_fs_path() -> None: + assert sanitize_fs_path("/thing.png") == "thing.png" + assert sanitize_fs_path("../../thing.png") == "thing.png" + assert sanitize_fs_path("thing.png") == "thing.png" + assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png" diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index ee153ab..7fe59f1 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -189,7 +189,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): results = [r for r in results if r] requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')] - requests = [r for r in requests if r] + requests = [r for r in requests if r and r['ingest_type'] != 'dataset-file'] if requests: resp = self.db.insert_ingest_request(self.cur, requests) |