improve fileset ingest integration with file ingest

author: Bryan Newbold <bnewbold@archive.org> 2021-10-15 17:13:39 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit: 13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7 (patch)
tree: 853a3dc60dcf3bd635be0816ff59b23f0975ae7d
parent: a09396caefe709b521e560add5b01c1a5c94cb53 (diff)
download: sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.tar.gz sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.zip
4 files changed, 25 insertions, 5 deletions
diff --git a/python/example.env b/python/example.env
index 5064c96..85af66c 100644
--- a/python/example.env
+++ b/python/example.env
@@ -5,3 +5,4 @@ IA_SECRET_KEY="dummy"
 CDX_AUTH_TOKEN="dummy"
 PETABOX_WEBDATA_SECRET="dummy"
 SENTRY_DSN=""
+SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/"
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index afaa329..72d4e14 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -43,7 +43,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         check_existing_ingest(base_url) -> ingest_file_result or none
         process_existing(result) -> response
-            try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit()
+            try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_file_hit()
 
     Fetch resource:
 
@@ -51,7 +51,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
     Process resource:
 
-        process_hit(ResourceResult) -> response
+        process_file_hit(ResourceResult) -> response
         process_grobid(ResourceResult)
     """
 
@@ -281,10 +281,12 @@ class IngestFileWorker(SandcrawlerWorker):
         }
         return result
 
-    def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
+    def process_file_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
         """
         Run all the necessary processing for a new/fresh ingest hit.
         """
+        if ingest_type in ["dataset-file", "component"] and file_meta['mimetype'] == "application/pdf":
+            ingest_type = "pdf"
         if ingest_type == "pdf":
             return {
                 'grobid': self.process_grobid(resource, file_meta),
@@ -304,6 +306,8 @@ class IngestFileWorker(SandcrawlerWorker):
             return {}
         elif ingest_type == "component":
             return {}
+        elif ingest_type == "dataset-file":
+            return {}
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
 
@@ -770,7 +774,7 @@ class IngestFileWorker(SandcrawlerWorker):
         else:
             raise NotImplementedError()
 
-        info = self.process_hit(ingest_type, resource, file_meta)
+        info = self.process_file_hit(ingest_type, resource, file_meta)
         result.update(info)
 
         # check if processing turned up an error
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index dc46e9a..37a2a82 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,4 +1,5 @@
 
+import os
 import base64
 import magic
 import hashlib
@@ -261,3 +262,17 @@ def requests_retry_session(retries=10, backoff_factor=3,
     session.mount('https://', adapter)
     return session
 
+def sanitize_fs_path(path: str) -> str:
+    """
+    From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+    """
+    # - pretending to chroot to the current directory
+    # - cancelling all redundant paths (/.. = /)
+    # - making the path relative
+    return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/")
+
+def test_sanitize_fs_path() -> None:
+    assert sanitize_fs_path("/thing.png") == "thing.png"
+    assert sanitize_fs_path("../../thing.png") == "thing.png"
+    assert sanitize_fs_path("thing.png") == "thing.png"
+    assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index ee153ab..7fe59f1 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -189,7 +189,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         results = [r for r in results if r]
 
         requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')]
-        requests = [r for r in requests if r]
+        requests = [r for r in requests if r and r['ingest_type'] != 'dataset-file']
 
         if requests:
             resp = self.db.insert_ingest_request(self.cur, requests)
author	Bryan Newbold <bnewbold@archive.org>	2021-10-15 17:13:39 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-15 18:15:29 -0700
commit	13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7 (patch)
tree	853a3dc60dcf3bd635be0816ff59b23f0975ae7d
parent	a09396caefe709b521e560add5b01c1a5c94cb53 (diff)
download	sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.tar.gz sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.zip