From 826c7538e091fac14d987a3cd654975da964e240 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 27 Oct 2021 18:50:17 -0700
Subject: make fmt (black 21.9b0)

---
 python/sandcrawler/misc.py | 115 +++++++++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 51 deletions(-)

(limited to 'python/sandcrawler/misc.py')

diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 1c779ce..db001dd 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -15,7 +15,7 @@ def clean_url(s: str) -> str:
     s = s.strip()
     parsed = urlcanon.parse_url(s)
     if not parsed.port and parsed.colon_before_port:
-        parsed.colon_before_port = b''
+        parsed.colon_before_port = b""
     return str(urlcanon.whatwg(parsed))


@@ -23,10 +23,12 @@ def url_fuzzy_equal(left: str, right: str) -> bool:
     """
     TODO: use proper surt library and canonicalization for this check
     """
-    fuzzy_left = '://'.join(
-        clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
-    fuzzy_right = '://'.join(
-        clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
+    fuzzy_left = "://".join(
+        clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:]
+    )
+    fuzzy_right = "://".join(
+        clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:]
+    )
     if fuzzy_left == fuzzy_right:
         return True
     elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
@@ -35,10 +37,13 @@ def url_fuzzy_equal(left: str, right: str) -> bool:


 def test_url_fuzzy_equal() -> None:
-    assert url_fuzzy_equal(
-        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
-        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
-    ) is True
+    assert (
+        url_fuzzy_equal(
+            "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+            "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+        )
+        is True
+    )


 def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
@@ -53,7 +58,7 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     if len(blob) < 1024 * 1024:
         mimetype = magic.Magic(mime=True).from_buffer(blob)
     else:
-        mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024 * 1024)])
+        mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
     if mimetype in ("application/xml", "text/xml"):
         # crude checks for XHTML or JATS XML, using only first 1 kB of file
         if b"[...]
@@ [...] dict:
     assert path is not None
     mimetype = magic.Magic(mime=True).from_file(path)
     if mimetype in ("application/xml", "text/xml"):
-        with open(path, 'rb') as f:
+        with open(path, "rb") as f:
             blob = f.read(1024)
         # crude checks for XHTML or JATS XML, using only first 1 kB of file
-        if b"[...]
@@ [...] dict:
         hashlib.md5(),
     ]
     size_bytes = 0
-    with open(path, 'rb') as f:
+    with open(path, "rb") as f:
         while True:
             chunk = f.read(1024 * 1024)
             if not chunk:
@@ -128,15 +136,15 @@ def b32_hex(s: str) -> str:
         if len(s) == 40:
             return s
         raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")


 NORMAL_MIME = (
-    'application/pdf',
-    'application/postscript',
-    'text/html',
-    'text/xml',
-    'application/octet-stream',
+    "application/pdf",
+    "application/postscript",
+    "text/html",
+    "text/xml",
+    "application/octet-stream",
 )


@@ -147,22 +155,22 @@ def normalize_mime(raw: str) -> Optional[str]:
             return norm

     # Special cases
-    if raw.startswith('application/xml'):
-        return 'text/xml'
-    if raw.startswith('application/x-pdf'):
-        return 'application/pdf'
-    if raw in ('.pdf', ):
-        return 'application/pdf'
+    if raw.startswith("application/xml"):
+        return "text/xml"
+    if raw.startswith("application/x-pdf"):
+        return "application/pdf"
+    if raw in (".pdf",):
+        return "application/pdf"
     if raw in (
-        'application/download',
-        'binary/octet-stream',
-        'unk',
-        'application/x-download',
-        'application/octetstream',
-        'application/force-download',
-        'application/unknown',
+        "application/download",
+        "binary/octet-stream",
+        "unk",
+        "application/x-download",
+        "application/octetstream",
+        "application/force-download",
+        "application/unknown",
     ):
-        return 'application/octet-stream'
+        return "application/octet-stream"
     return None


@@ -200,14 +208,19 @@ def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
     offset = cdx[9]
     warc = cdx[10]

-    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit() and len(sha1b32) == 32
-            and dt.isdigit()):
+    if not (
+        sha1b32.isalnum()
+        and c_size.isdigit()
+        and offset.isdigit()
+        and len(sha1b32) == 32
+        and dt.isdigit()
+    ):
         return None

-    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+    if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
         return None

-    if mime is None or mime == '-':
+    if mime is None or mime == "-":
         mime = "application/octet-stream"

     if normalize:
@@ -242,16 +255,13 @@ def test_parse_cdx_datetime() -> None:
     assert parse_cdx_datetime("") is None
     assert parse_cdx_datetime("asdf") is None
     assert parse_cdx_datetime("19930203123045") is not None
-    assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020,
-                                                                     month=10,
-                                                                     day=28,
-                                                                     hour=23,
-                                                                     minute=51,
-                                                                     second=3)
+    assert parse_cdx_datetime("20201028235103") == datetime.datetime(
+        year=2020, month=10, day=28, hour=23, minute=51, second=3
+    )


 def datetime_to_cdx(dt: datetime.datetime) -> str:
-    return '%04d%02d%02d%02d%02d%02d' % (
+    return "%04d%02d%02d%02d%02d%02d" % (
         dt.year,
         dt.month,
         dt.day,
@@ -263,13 +273,16 @@ def datetime_to_cdx(dt: datetime.datetime) -> str:

 def test_datetime_to_cdx() -> None:
     assert "20201028235103" == datetime_to_cdx(
-        datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
+        datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+    )


-def requests_retry_session(retries: int = 10,
-                           backoff_factor: int = 3,
-                           status_forcelist: List[int] = [500, 502, 504],
-                           session: requests.Session = None) -> requests.Session:
+def requests_retry_session(
+    retries: int = 10,
+    backoff_factor: int = 3,
+    status_forcelist: List[int] = [500, 502, 504],
+    session: requests.Session = None,
+) -> requests.Session:
     """
     From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
     """
@@ -282,8 +295,8 @@ def requests_retry_session(retries: int = 10,
         status_forcelist=status_forcelist,
     )
     adapter = HTTPAdapter(max_retries=retry)
-    session.mount('http://', adapter)
-    session.mount('https://', adapter)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
     return session

-- 
cgit v1.2.3