diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/example.env | 1 | ||||
| -rw-r--r-- | python/sandcrawler/ingest_file.py | 12 | ||||
| -rw-r--r-- | python/sandcrawler/misc.py | 15 | ||||
| -rw-r--r-- | python/sandcrawler/persist.py | 2 | 
4 files changed, 25 insertions, 5 deletions
| diff --git a/python/example.env b/python/example.env index 5064c96..85af66c 100644 --- a/python/example.env +++ b/python/example.env @@ -5,3 +5,4 @@ IA_SECRET_KEY="dummy"  CDX_AUTH_TOKEN="dummy"  PETABOX_WEBDATA_SECRET="dummy"  SENTRY_DSN="" +SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/" diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index afaa329..72d4e14 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -43,7 +43,7 @@ class IngestFileWorker(SandcrawlerWorker):          check_existing_ingest(base_url) -> ingest_file_result or none          process_existing(result) -> response -            try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit() +            try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_file_hit()      Fetch resource: @@ -51,7 +51,7 @@ class IngestFileWorker(SandcrawlerWorker):      Process resource: -        process_hit(ResourceResult) -> response +        process_file_hit(ResourceResult) -> response          process_grobid(ResourceResult)      """ @@ -281,10 +281,12 @@ class IngestFileWorker(SandcrawlerWorker):          }          return result -    def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict: +    def process_file_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:          """          Run all the necessary processing for a new/fresh ingest hit.          """ +        if ingest_type in ["dataset-file", "component"] and file_meta['mimetype'] == "application/pdf": +            ingest_type = "pdf"          if ingest_type == "pdf":              return {                  'grobid': self.process_grobid(resource, file_meta), @@ -304,6 +306,8 @@ class IngestFileWorker(SandcrawlerWorker):              return {}          elif ingest_type == "component":              return {} +        elif ingest_type == "dataset-file": +            return {}          else:              raise NotImplementedError(f"process {ingest_type} hit") @@ -770,7 +774,7 @@ class IngestFileWorker(SandcrawlerWorker):          else:              raise NotImplementedError() -        info = self.process_hit(ingest_type, resource, file_meta) +        info = self.process_file_hit(ingest_type, resource, file_meta)          result.update(info)          # check if processing turned up an error diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index dc46e9a..37a2a82 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -1,4 +1,5 @@ +import os  import base64  import magic  import hashlib @@ -261,3 +262,17 @@ def requests_retry_session(retries=10, backoff_factor=3,      session.mount('https://', adapter)      return session +def sanitize_fs_path(path: str) -> str: +    """ +    From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540 +    """ +    # - pretending to chroot to the current directory +    # - cancelling all redundant paths (/.. = /) +    # - making the path relative +    return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/") + +def test_sanitize_fs_path() -> None: +    assert sanitize_fs_path("/thing.png") == "thing.png" +    assert sanitize_fs_path("../../thing.png") == "thing.png" +    assert sanitize_fs_path("thing.png") == "thing.png" +    assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png" diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index ee153ab..7fe59f1 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -189,7 +189,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):          results = [r for r in results if r]          requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')] -        requests = [r for r in requests if r] +        requests = [r for r in requests if r and r['ingest_type'] != 'dataset-file']          if requests:              resp = self.db.insert_ingest_request(self.cur, requests) | 
