diff options
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r-- | python/sandcrawler/misc.py | 242 |
1 files changed, 189 insertions, 53 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index d9c9d55..4e37036 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -1,28 +1,70 @@ - import base64 -import magic -import hashlib import datetime +import hashlib +import os +from typing import List, Optional + +import magic import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error import urlcanon +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error -def clean_url(s): +def clean_url(s: str) -> str: + s = s.strip() parsed = urlcanon.parse_url(s) if not parsed.port and parsed.colon_before_port: - parsed.colon_before_port = b'' + parsed.colon_before_port = b"" return str(urlcanon.whatwg(parsed)) -def gen_file_metadata(blob): + +def url_fuzzy_equal(left: str, right: str) -> bool: + """ + TODO: use proper surt library and canonicalization for this check + """ + fuzzy_left = "://".join( + clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:] + ) + fuzzy_right = "://".join( + clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:] + ) + if fuzzy_left == fuzzy_right: + return True + elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/": + return True + return False + + +def test_url_fuzzy_equal() -> None: + assert ( + url_fuzzy_equal( + "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", + "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", + ) + is True + ) + + +def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: """ Takes a file blob (bytestream) and returns hashes and other metadata. Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype """ - assert blob - mimetype = magic.Magic(mime=True).from_buffer(blob) + assert blob is not None + if not allow_empty: + assert blob + if len(blob) < 1024 * 1024: + mimetype = magic.Magic(mime=True).from_buffer(blob) + else: + mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)]) + if mimetype in ("application/xml", "text/xml"): + # crude checks for XHTML or JATS XML, using only first 1 kB of file + if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]: + mimetype = "application/xhtml+xml" + elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]: + mimetype = "application/jats+xml" hashes = [ hashlib.sha1(), hashlib.sha256(), @@ -38,7 +80,50 @@ def gen_file_metadata(blob): mimetype=mimetype, ) -def b32_hex(s): + +def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict: + """ + Variant of gen_file_metadata() which works with files on local disk + """ + assert path is not None + mimetype = magic.Magic(mime=True).from_file(path) + if mimetype in ("application/xml", "text/xml"): + with open(path, "rb") as f: + blob = f.read(1024) + # crude checks for XHTML or JATS XML, using only first 1 kB of file + if ( + b"<htm" in blob[:1024] + and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024] + ): + mimetype = "application/xhtml+xml" + elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]: + mimetype = "application/jats+xml" + hashes = [ + hashlib.sha1(), + hashlib.sha256(), + hashlib.md5(), + ] + size_bytes = 0 + with open(path, "rb") as f: + while True: + chunk = f.read(1024 * 1024) + if not chunk: + break + size_bytes += len(chunk) + for h in hashes: + h.update(chunk) + if not allow_empty: + assert size_bytes > 0 + return dict( + size_bytes=size_bytes, + sha1hex=hashes[0].hexdigest(), + sha256hex=hashes[1].hexdigest(), + md5hex=hashes[2].hexdigest(), + mimetype=mimetype, + ) + + +def b32_hex(s: str) -> str: """ Converts a base32-encoded SHA-1 checksum into hex-encoded @@ -51,45 +136,45 @@ def b32_hex(s): if len(s) == 40: return s raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + NORMAL_MIME = ( - 'application/pdf', - 'application/postscript', - 'text/html', - 'text/xml', - 'application/octet-stream', + "application/pdf", + "application/postscript", + "text/html", + "text/xml", + "application/octet-stream", ) -def normalize_mime(raw): + +def normalize_mime(raw: str) -> Optional[str]: raw = raw.lower().strip() for norm in NORMAL_MIME: if raw.startswith(norm): return norm # Special cases - if raw.startswith('application/xml'): - return 'text/xml' - if raw.startswith('application/x-pdf'): - return 'application/pdf' - if raw in ( - '.pdf', - ): - return 'application/pdf' + if raw.startswith("application/xml"): + return "text/xml" + if raw.startswith("application/x-pdf"): + return "application/pdf" + if raw in (".pdf",): + return "application/pdf" if raw in ( - 'application/download', - 'binary/octet-stream', - 'unk', - 'application/x-download', - 'application/octetstream', - 'application/force-download', - 'application/unknown', - ): - return 'application/octet-stream' + "application/download", + "binary/octet-stream", + "unk", + "application/x-download", + "application/octetstream", + "application/force-download", + "application/unknown", + ): + return "application/octet-stream" return None -def test_normalize_mime(): +def test_normalize_mime() -> None: assert normalize_mime("asdf") is None assert normalize_mime("application/pdf") == "application/pdf" assert normalize_mime("application/pdf+journal") == "application/pdf" @@ -102,7 +187,7 @@ def test_normalize_mime(): assert normalize_mime("binary/octet-stream") == "application/octet-stream" -def parse_cdx_line(raw_cdx, normalize=True): +def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]: """ This method always filters a few things out: @@ -123,46 +208,81 @@ def parse_cdx_line(raw_cdx, normalize=True): offset = cdx[9] warc = cdx[10] - if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit() - and len(sha1b32) == 32 and dt.isdigit()): + if not ( + sha1b32.isalnum() + and c_size.isdigit() + and offset.isdigit() + and len(sha1b32) == 32 + and dt.isdigit() + ): return None - if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc): + if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc): return None - if mime is None or mime == '-': + if mime is None or mime == "-": mime = "application/octet-stream" if normalize: mime = normalize_mime(mime) sha1hex = b32_hex(sha1b32) - http_status = int(http_status) - c_size = int(c_size) - offset = int(offset) return dict( surt=surt, url=url, datetime=dt, mimetype=mime, - http_status=http_status, + http_status=int(http_status), sha1b32=sha1b32, sha1hex=sha1hex, - warc_csize=c_size, - warc_offset=offset, + warc_csize=int(c_size), + warc_offset=int(offset), warc_path=warc, ) -def parse_cdx_datetime(dt_str): + +def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]: + if not dt_str: + return None try: - return datetime.strptime(dt_str, "%Y%m%d%H%M%S") + return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S") except Exception: return None -def requests_retry_session(retries=10, backoff_factor=3, - status_forcelist=(500, 502, 504), session=None): +def test_parse_cdx_datetime() -> None: + assert parse_cdx_datetime("") is None + assert parse_cdx_datetime("asdf") is None + assert parse_cdx_datetime("19930203123045") is not None + assert parse_cdx_datetime("20201028235103") == datetime.datetime( + year=2020, month=10, day=28, hour=23, minute=51, second=3 + ) + + +def datetime_to_cdx(dt: datetime.datetime) -> str: + return "%04d%02d%02d%02d%02d%02d" % ( + dt.year, + dt.month, + dt.day, + dt.hour, + dt.minute, + dt.second, + ) + + +def test_datetime_to_cdx() -> None: + assert "20201028235103" == datetime_to_cdx( + datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3) + ) + + +def requests_retry_session( + retries: int = 10, + backoff_factor: int = 1, + status_forcelist: List[int] = [500, 502, 504], + session: Optional[requests.Session] = None, +) -> requests.Session: """ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests """ @@ -175,7 +295,23 @@ def requests_retry_session(retries=10, backoff_factor=3, status_forcelist=status_forcelist, ) adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) + session.mount("http://", adapter) + session.mount("https://", adapter) return session + +def sanitize_fs_path(path: str) -> str: + """ + From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540 + """ + # - pretending to chroot to the current directory + # - cancelling all redundant paths (/.. = /) + # - making the path relative + return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/") + + +def test_sanitize_fs_path() -> None: + assert sanitize_fs_path("/thing.png") == "thing.png" + assert sanitize_fs_path("../../thing.png") == "thing.png" + assert sanitize_fs_path("thing.png") == "thing.png" + assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png" |