From 13d56402cf5e01ca8e2306f85dbcc3b3f92a94d7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 15 Oct 2021 17:13:39 -0700
Subject: improve fileset ingest integration with file ingest

---
 python/sandcrawler/misc.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'python/sandcrawler/misc.py')

diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index dc46e9a..37a2a82 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,4 +1,5 @@
 
+import os
 import base64
 import magic
 import hashlib
@@ -261,3 +262,17 @@ def requests_retry_session(retries=10, backoff_factor=3,
     session.mount('https://', adapter)
     return session
 
+def sanitize_fs_path(path: str) -> str:
+    """
+    From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+    """
+    # - pretending to chroot to the current directory
+    # - cancelling all redundant paths (/.. = /)
+    # - making the path relative
+    return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/")
+
+def test_sanitize_fs_path() -> None:
+    assert sanitize_fs_path("/thing.png") == "thing.png"
+    assert sanitize_fs_path("../../thing.png") == "thing.png"
+    assert sanitize_fs_path("thing.png") == "thing.png"
+    assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"
-- 
cgit v1.2.3