1 files changed, 189 insertions, 53 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index d9c9d55..4e37036 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,28 +1,70 @@
-
 import base64
-import magic
-import hashlib
 import datetime
+import hashlib
+import os
+from typing import List, Optional
+
+import magic
 import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
 import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error
 
 
-def clean_url(s):
+def clean_url(s: str) -> str:
+    s = s.strip()
     parsed = urlcanon.parse_url(s)
     if not parsed.port and parsed.colon_before_port:
-        parsed.colon_before_port = b''
+        parsed.colon_before_port = b""
     return str(urlcanon.whatwg(parsed))
 
-def gen_file_metadata(blob):
+
+def url_fuzzy_equal(left: str, right: str) -> bool:
+    """
+    Loose URL comparison: the scheme is dropped, every "www." is removed, ":80/"
+    becomes "/", and a difference of one trailing slash is tolerated.
+
+    TODO: use proper surt library and canonicalization for this check
+    """
+
+    def _fuzzy(url: str) -> str:
+        # keep only what follows the first "://" (later "://" are re-joined)
+        cleaned = clean_url(url).replace("www.", "").replace(":80/", "/")
+        return "://".join(cleaned.split("://")[1:])
+
+    a, b = _fuzzy(left), _fuzzy(right)
+    return a == b or a == b + "/" or b == a + "/"
+
+
+def test_url_fuzzy_equal() -> None:
+    # a "www." prefix on the host alone must not make the URLs compare unequal
+    with_www = "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
+    without_www = "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
+
+    verdict = url_fuzzy_equal(with_www, without_www)
+
+    assert verdict is True
+
+
+def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     """
     Takes a file blob (bytestream) and returns hashes and other metadata.
 
     Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype
     """
-    assert blob
-    mimetype = magic.Magic(mime=True).from_buffer(blob)
+    assert blob is not None
+    if not allow_empty:
+        assert blob
+    if len(blob) < 1024 * 1024:
+        mimetype = magic.Magic(mime=True).from_buffer(blob)
+    else:
+        mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
+    if mimetype in ("application/xml", "text/xml"):
+        # crude checks for XHTML or JATS XML, using only first 1 kB of file
+        if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
+            mimetype = "application/xhtml+xml"
+        elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
+            mimetype = "application/jats+xml"
     hashes = [
         hashlib.sha1(),
         hashlib.sha256(),
@@ -38,7 +80,50 @@ def gen_file_metadata(blob):
         mimetype=mimetype,
     )
 
-def b32_hex(s):
+
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+    """
+    Variant of gen_file_metadata() which works with files on local disk
+
+    The hashing pass reads the file in 1 MiB chunks rather than loading it whole.
+    Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype
+    """
+    assert path is not None
+    mimetype = magic.Magic(mime=True).from_file(path)
+    if mimetype in ("application/xml", "text/xml"):
+        # crude checks for XHTML or JATS XML, using only first 1 kB of file
+        with open(path, "rb") as f:
+            head = f.read(1024)
+        is_xhtml = b"<htm" in head and b'xmlns="http://www.w3.org/1999/xhtml"' in head
+        if is_xhtml:
+            mimetype = "application/xhtml+xml"
+        elif b"<article " in head and b"<html" not in head:
+            mimetype = "application/jats+xml"
+
+    sha1 = hashlib.sha1()
+    sha256 = hashlib.sha256()
+    md5 = hashlib.md5()
+    size_bytes = 0
+    with open(path, "rb") as f:
+        # iter() with a sentinel stops at the first empty read (end of file)
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            size_bytes += len(chunk)
+            sha1.update(chunk)
+            sha256.update(chunk)
+            md5.update(chunk)
+
+    # an empty file is rejected unless the caller opted in
+    assert allow_empty or size_bytes > 0
+    return {
+        "size_bytes": size_bytes,
+        "sha1hex": sha1.hexdigest(),
+        "sha256hex": sha256.hexdigest(),
+        "md5hex": md5.hexdigest(),
+        "mimetype": mimetype,
+    }
+
+
+def b32_hex(s: str) -> str:
     """
     Converts a base32-encoded SHA-1 checksum into hex-encoded
 
@@ -51,45 +136,45 @@ def b32_hex(s):
         if len(s) == 40:
             return s
         raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
 
 NORMAL_MIME = (
-    'application/pdf',
-    'application/postscript',
-    'text/html',
-    'text/xml',
-    'application/octet-stream',
+    "application/pdf",
+    "application/postscript",
+    "text/html",
+    "text/xml",
+    "application/octet-stream",
 )
 
-def normalize_mime(raw):
+
+def normalize_mime(raw: str) -> Optional[str]:
     raw = raw.lower().strip()
     for norm in NORMAL_MIME:
         if raw.startswith(norm):
             return norm
 
     # Special cases
-    if raw.startswith('application/xml'):
-        return 'text/xml'
-    if raw.startswith('application/x-pdf'):
-        return 'application/pdf'
-    if raw in (
-            '.pdf',
-            ):
-        return 'application/pdf'
+    if raw.startswith("application/xml"):
+        return "text/xml"
+    if raw.startswith("application/x-pdf"):
+        return "application/pdf"
+    if raw in (".pdf",):
+        return "application/pdf"
     if raw in (
-            'application/download',
-            'binary/octet-stream',
-            'unk',
-            'application/x-download',
-            'application/octetstream',
-            'application/force-download',
-            'application/unknown',
-            ):
-        return 'application/octet-stream'
+        "application/download",
+        "binary/octet-stream",
+        "unk",
+        "application/x-download",
+        "application/octetstream",
+        "application/force-download",
+        "application/unknown",
+    ):
+        return "application/octet-stream"
     return None
 
 
-def test_normalize_mime():
+def test_normalize_mime() -> None:
     assert normalize_mime("asdf") is None
     assert normalize_mime("application/pdf") == "application/pdf"
     assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -102,7 +187,7 @@ def test_normalize_mime():
     assert normalize_mime("binary/octet-stream") == "application/octet-stream"
 
 
-def parse_cdx_line(raw_cdx, normalize=True):
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
     """
     This method always filters a few things out:
 
@@ -123,46 +208,81 @@ def parse_cdx_line(raw_cdx, normalize=True):
     offset = cdx[9]
     warc = cdx[10]
 
-    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
-            and len(sha1b32) == 32 and dt.isdigit()):
+    if not (
+        sha1b32.isalnum()
+        and c_size.isdigit()
+        and offset.isdigit()
+        and len(sha1b32) == 32
+        and dt.isdigit()
+    ):
         return None
 
-    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+    if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
         return None
 
-    if mime is None or mime == '-':
+    if mime is None or mime == "-":
         mime = "application/octet-stream"
 
     if normalize:
         mime = normalize_mime(mime)
 
     sha1hex = b32_hex(sha1b32)
-    http_status = int(http_status)
-    c_size = int(c_size)
-    offset = int(offset)
 
     return dict(
         surt=surt,
         url=url,
         datetime=dt,
         mimetype=mime,
-        http_status=http_status,
+        http_status=int(http_status),
         sha1b32=sha1b32,
         sha1hex=sha1hex,
-        warc_csize=c_size,
-        warc_offset=offset,
+        warc_csize=int(c_size),
+        warc_offset=int(offset),
         warc_path=warc,
     )
 
-def parse_cdx_datetime(dt_str):
+
+def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
+    if not dt_str:
+        return None
     try:
-        return datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+        return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
     except Exception:
         return None
 
 
-def requests_retry_session(retries=10, backoff_factor=3,
-        status_forcelist=(500, 502, 504), session=None):
+def test_parse_cdx_datetime() -> None:
+    # empty and non-numeric inputs yield None instead of raising
+    for rejected in ("", "asdf"):
+        assert parse_cdx_datetime(rejected) is None
+    expected = datetime.datetime(2020, 10, 28, 23, 51, 3)
+    assert parse_cdx_datetime("19930203123045") is not None
+    assert parse_cdx_datetime("20201028235103") == expected
+
+
+def datetime_to_cdx(dt: datetime.datetime) -> str:
+    """
+    Formats a datetime as a 14-digit timestamp string: YYYYMMDDhhmmss
+
+    Only whole seconds are emitted; microseconds and tzinfo are ignored.
+    """
+    # year is zero-padded to 4 digits, every other field to 2
+    fields = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
+    return "%04d%02d%02d%02d%02d%02d" % fields
+
+
+def test_datetime_to_cdx() -> None:
+    # single-digit fields (here: second=3) must come out zero-padded
+    moment = datetime.datetime(2020, 10, 28, 23, 51, 3)
+    assert "20201028235103" == datetime_to_cdx(moment)
+
+
+def requests_retry_session(
+    retries: int = 10,
+    backoff_factor: int = 1,
+    status_forcelist: List[int] = [500, 502, 504],
+    session: Optional[requests.Session] = None,
+) -> requests.Session:
     """
     From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
     """
@@ -175,7 +295,23 @@ def requests_retry_session(retries=10, backoff_factor=3,
         status_forcelist=status_forcelist,
     )
     adapter = HTTPAdapter(max_retries=retry)
-    session.mount('http://', adapter)
-    session.mount('https://', adapter)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
     return session
 
+
+def sanitize_fs_path(path: str) -> str:
+    """
+    From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+    """
+    # anchor at "/" as if chrooted, so normpath cancels any ".." at the root
+    rooted = os.path.normpath(os.path.join("/", path))
+    # then strip the anchor again to hand back a relative path
+    return os.path.relpath(rooted, "/")
+
+
+def test_sanitize_fs_path() -> None:
+    # absolute, parent-escaping, and already-clean inputs all end up relative
+    for raw in ("/thing.png", "../../thing.png", "thing.png"):
+        assert sanitize_fs_path(raw) == "thing.png"
+    assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"