about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/misc.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2021-10-15 17:13:39 -0700
committer: Bryan Newbold <bnewbold@archive.org>, 2021-10-15 18:15:29 -0700
commit: 13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7 (patch)
tree: 853a3dc60dcf3bd635be0816ff59b23f0975ae7d /python/sandcrawler/misc.py
parent: a09396caefe709b521e560add5b01c1a5c94cb53 (diff)
download: sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.tar.gz
download: sandcrawler-13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7.zip
improve fileset ingest integration with file ingest
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r-- python/sandcrawler/misc.py 15
1 files changed, 15 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index dc46e9a..37a2a82 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,4 +1,5 @@
+import os
import base64
import magic
import hashlib
@@ -261,3 +262,17 @@ def requests_retry_session(retries=10, backoff_factor=3,
session.mount('https://', adapter)
return session
+def sanitize_fs_path(path: str) -> str:
+    """
+    Reduce an arbitrary path string to a normalized, relative path.
+
+    The input is anchored at "/" (as if chroot-ed there), redundant
+    components are collapsed, and the result is re-expressed relative to
+    "/". With POSIX path semantics this means a leading "/" is dropped and
+    ".." segments cannot climb above the starting point.
+
+    From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+    """
+    # anchor at the root (an already-absolute `path` replaces the "/" prefix)
+    rooted = os.path.join("/", path)
+    # cancel redundant pieces: ".", duplicate separators, and "/.." -> "/"
+    collapsed = os.path.normpath(rooted)
+    # strip the root again so the caller gets a relative path
+    return os.path.relpath(collapsed, "/")
+
+def test_sanitize_fs_path() -> None:
+    """Absolute and parent-escaping inputs come back relative; plain relative paths pass through."""
+    expected_by_input = {
+        "/thing.png": "thing.png",
+        "../../thing.png": "thing.png",
+        "thing.png": "thing.png",
+        "subdir/thing.png": "subdir/thing.png",
+    }
+    for raw, cleaned in expected_by_input.items():
+        assert sanitize_fs_path(raw) == cleaned