1 files changed, 150 insertions, 55 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index a3e2960..4e37036 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,39 +1,50 @@
-
 import base64
-import magic
-import hashlib
 import datetime
-from typing import Optional
+import hashlib
+import os
+from typing import List, Optional
 
+import magic
 import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
 import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error
 
 
 def clean_url(s: str) -> str:
     s = s.strip()
     parsed = urlcanon.parse_url(s)
     if not parsed.port and parsed.colon_before_port:
-        parsed.colon_before_port = b''
+        parsed.colon_before_port = b""
     return str(urlcanon.whatwg(parsed))
 
+
 def url_fuzzy_equal(left: str, right: str) -> bool:
     """
     TODO: use proper surt library and canonicalization for this check
     """
-    fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
-    fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
+    fuzzy_left = "://".join(
+        clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:]
+    )
+    fuzzy_right = "://".join(
+        clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:]
+    )
     if fuzzy_left == fuzzy_right:
         return True
     elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
         return True
     return False
 
+
 def test_url_fuzzy_equal() -> None:
-    assert True == url_fuzzy_equal(
-        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
-        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree")
+    assert (
+        url_fuzzy_equal(
+            "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+            "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+        )
+        is True
+    )
+
 
 def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     """
@@ -44,12 +55,15 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     assert blob is not None
     if not allow_empty:
         assert blob
-    mimetype = magic.Magic(mime=True).from_buffer(blob)
+    if len(blob) < 1024 * 1024:
+        mimetype = magic.Magic(mime=True).from_buffer(blob)
+    else:
+        mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
     if mimetype in ("application/xml", "text/xml"):
         # crude checks for XHTML or JATS XML, using only first 1 kB of file
         if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
             mimetype = "application/xhtml+xml"
-        elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+        elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
             mimetype = "application/jats+xml"
     hashes = [
         hashlib.sha1(),
@@ -66,6 +80,49 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
         mimetype=mimetype,
     )
 
+
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+    """
+    On-disk counterpart of gen_file_metadata(): takes a local file path
+    instead of an in-memory blob, and hashes the file in 1 MiB chunks
+    rather than reading it whole.
+
+    Returns a dict with size_bytes, sha1hex, sha256hex, md5hex and mimetype.
+    Asserts that the file is non-empty unless allow_empty is set.
+    """
+    assert path is not None
+    mimetype = magic.Magic(mime=True).from_file(path)
+    if mimetype in ("application/xml", "text/xml"):
+        # crude checks for XHTML or JATS XML, using only first 1 kB of file
+        with open(path, "rb") as xml_file:
+            head = xml_file.read(1024)
+        xhtml_ns = b'xmlns="http://www.w3.org/1999/xhtml"'
+        if b"<htm" in head and xhtml_ns in head:
+            mimetype = "application/xhtml+xml"
+        elif b"<article " in head and b"<html" not in head:
+            mimetype = "application/jats+xml"
+    sha1 = hashlib.sha1()
+    sha256 = hashlib.sha256()
+    md5 = hashlib.md5()
+    size_bytes = 0
+    with open(path, "rb") as data_file:
+        # iter() with a sentinel stops at the first empty read, i.e. at EOF
+        for chunk in iter(lambda: data_file.read(1024 * 1024), b""):
+            size_bytes += len(chunk)
+            sha1.update(chunk)
+            sha256.update(chunk)
+            md5.update(chunk)
+    if not allow_empty:
+        assert size_bytes > 0
+    return {
+        "size_bytes": size_bytes,
+        "sha1hex": sha1.hexdigest(),
+        "sha256hex": sha256.hexdigest(),
+        "md5hex": md5.hexdigest(),
+        "mimetype": mimetype,
+    }
+
+
 def b32_hex(s: str) -> str:
     """
     Converts a base32-encoded SHA-1 checksum into hex-encoded
@@ -79,16 +136,18 @@ def b32_hex(s: str) -> str:
         if len(s) == 40:
             return s
         raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
 
 NORMAL_MIME = (
-    'application/pdf',
-    'application/postscript',
-    'text/html',
-    'text/xml',
-    'application/octet-stream',
+    "application/pdf",
+    "application/postscript",
+    "text/html",
+    "text/xml",
+    "application/octet-stream",
 )
 
+
 def normalize_mime(raw: str) -> Optional[str]:
     raw = raw.lower().strip()
     for norm in NORMAL_MIME:
@@ -96,28 +155,26 @@ def normalize_mime(raw: str) -> Optional[str]:
             return norm
 
     # Special cases
-    if raw.startswith('application/xml'):
-        return 'text/xml'
-    if raw.startswith('application/x-pdf'):
-        return 'application/pdf'
+    if raw.startswith("application/xml"):
+        return "text/xml"
+    if raw.startswith("application/x-pdf"):
+        return "application/pdf"
+    if raw in (".pdf",):
+        return "application/pdf"
     if raw in (
-            '.pdf',
-            ):
-        return 'application/pdf'
-    if raw in (
-            'application/download',
-            'binary/octet-stream',
-            'unk',
-            'application/x-download',
-            'application/octetstream',
-            'application/force-download',
-            'application/unknown',
-            ):
-        return 'application/octet-stream'
+        "application/download",
+        "binary/octet-stream",
+        "unk",
+        "application/x-download",
+        "application/octetstream",
+        "application/force-download",
+        "application/unknown",
+    ):
+        return "application/octet-stream"
     return None
 
 
-def test_normalize_mime():
+def test_normalize_mime() -> None:
     assert normalize_mime("asdf") is None
     assert normalize_mime("application/pdf") == "application/pdf"
     assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -130,7 +187,7 @@ def test_normalize_mime():
     assert normalize_mime("binary/octet-stream") == "application/octet-stream"
 
 
-def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
     """
     This method always filters a few things out:
 
@@ -151,14 +208,19 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
     offset = cdx[9]
     warc = cdx[10]
 
-    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
-            and len(sha1b32) == 32 and dt.isdigit()):
+    if not (
+        sha1b32.isalnum()
+        and c_size.isdigit()
+        and offset.isdigit()
+        and len(sha1b32) == 32
+        and dt.isdigit()
+    ):
         return None
 
-    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+    if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
         return None
 
-    if mime is None or mime == '-':
+    if mime is None or mime == "-":
         mime = "application/octet-stream"
 
     if normalize:
@@ -179,6 +241,7 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
         warc_path=warc,
     )
 
+
 def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
     if not dt_str:
         return None
@@ -187,23 +250,39 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
     except Exception:
         return None
 
+
 def test_parse_cdx_datetime() -> None:
-    assert parse_cdx_datetime("") == None
-    assert parse_cdx_datetime("asdf") == None
-    assert parse_cdx_datetime("19930203123045") != None
-    assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+    assert parse_cdx_datetime("") is None
+    assert parse_cdx_datetime("asdf") is None
+    assert parse_cdx_datetime("19930203123045") is not None
+    assert parse_cdx_datetime("20201028235103") == datetime.datetime(
+        year=2020, month=10, day=28, hour=23, minute=51, second=3
+    )
+
 
 def datetime_to_cdx(dt: datetime.datetime) -> str:
-    return '%04d%02d%02d%02d%02d%02d' % (
-        dt.year, dt.month, dt.day,
-        dt.hour, dt.minute, dt.second,
+    return "%04d%02d%02d%02d%02d%02d" % (
+        dt.year,
+        dt.month,
+        dt.day,
+        dt.hour,
+        dt.minute,
+        dt.second,
     )
 
+
 def test_datetime_to_cdx() -> None:
-    assert "20201028235103" == datetime_to_cdx(datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
+    assert "20201028235103" == datetime_to_cdx(
+        datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+    )
 
-def requests_retry_session(retries=10, backoff_factor=3,
-        status_forcelist=(500, 502, 504), session=None) -> requests.Session:
+
+def requests_retry_session(
+    retries: int = 10,
+    backoff_factor: int = 1,
+    status_forcelist: List[int] = [500, 502, 504],
+    session: Optional[requests.Session] = None,
+) -> requests.Session:
     """
     From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
     """
@@ -216,7 +295,23 @@ def requests_retry_session(retries=10, backoff_factor=3,
         status_forcelist=status_forcelist,
     )
     adapter = HTTPAdapter(max_retries=retry)
-    session.mount('http://', adapter)
-    session.mount('https://', adapter)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
     return session
 
+
+def sanitize_fs_path(path: str) -> str:
+    """
+    Confine `path` beneath a pretend root and return it in relative form.
+    From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+    """
+    # rooting at "/" first means normpath() folds any leading ".." into the root
+    rooted = os.path.normpath(os.path.join("/", path))
+    return os.path.relpath(rooted, "/")
+
+
+def test_sanitize_fs_path() -> None:
+    # absolute, parent-escaping and already-relative spellings of one name
+    for raw in ("/thing.png", "../../thing.png", "thing.png"):
+        assert sanitize_fs_path(raw) == "thing.png"
+    assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"