From 826c7538e091fac14d987a3cd654975da964e240 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 27 Oct 2021 18:50:17 -0700
Subject: make fmt (black 21.9b0)

---
 python/sandcrawler/misc.py | 115 +++++++++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 51 deletions(-)

(limited to 'python/sandcrawler/misc.py')

diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 1c779ce..db001dd 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -15,7 +15,7 @@ def clean_url(s: str) -> str:
     s = s.strip()
     parsed = urlcanon.parse_url(s)
     if not parsed.port and parsed.colon_before_port:
-        parsed.colon_before_port = b''
+        parsed.colon_before_port = b""
     return str(urlcanon.whatwg(parsed))


@@ -23,10 +23,12 @@ def url_fuzzy_equal(left: str, right: str) -> bool:
     """
     TODO: use proper surt library and canonicalization for this check
     """
-    fuzzy_left = '://'.join(
-        clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
-    fuzzy_right = '://'.join(
-        clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
+    fuzzy_left = "://".join(
+        clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:]
+    )
+    fuzzy_right = "://".join(
+        clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:]
+    )
     if fuzzy_left == fuzzy_right:
         return True
     elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
@@ -35,10 +37,13 @@ def url_fuzzy_equal(left: str, right: str) -> bool:


 def test_url_fuzzy_equal() -> None:
-    assert url_fuzzy_equal(
-        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
-        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
-    ) is True
+    assert (
+        url_fuzzy_equal(
+            "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+            "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+        )
+        is True
+    )


 def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
@@ -53,7 +58,7 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     if len(blob) < 1024 * 1024:
         mimetype = magic.Magic(mime=True).from_buffer(blob)
     else:
-        mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024 * 1024)])
+        mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
     if mimetype in ("application/xml", "text/xml"):
         # crude checks for XHTML or JATS XML, using only first 1 kB of file
         if b"[...]
@@ [...] dict:
     assert path is not None
     mimetype = magic.Magic(mime=True).from_file(path)
     if mimetype in ("application/xml", "text/xml"):
-        with open(path, 'rb') as f:
+        with open(path, "rb") as f:
             blob = f.read(1024)
         # crude checks for XHTML or JATS XML, using only first 1 kB of file
-        if b"[...]
@@ [...] dict:
         hashlib.md5(),
     ]
     size_bytes = 0
-    with open(path, 'rb') as f:
+    with open(path, "rb") as f:
         while True:
             chunk = f.read(1024 * 1024)
             if not chunk:
@@ -128,15 +136,15 @@ def b32_hex(s: str) -> str:
         if len(s) == 40:
             return s
         raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")


 NORMAL_MIME = (
-    'application/pdf',
-    'application/postscript',
-    'text/html',
-    'text/xml',
-    'application/octet-stream',
+    "application/pdf",
+    "application/postscript",
+    "text/html",
+    "text/xml",
+    "application/octet-stream",
 )


@@ -147,22 +155,22 @@ def normalize_mime(raw: str) -> Optional[str]:
             return norm

     # Special cases
-    if raw.startswith('application/xml'):
-        return 'text/xml'
-    if raw.startswith('application/x-pdf'):
-        return 'application/pdf'
-    if raw in ('.pdf', ):
-        return 'application/pdf'
+    if raw.startswith("application/xml"):
+        return "text/xml"
+    if raw.startswith("application/x-pdf"):
+        return "application/pdf"
+    if raw in (".pdf",):
+        return "application/pdf"
     if raw in (
-        'application/download',
-        'binary/octet-stream',
-        'unk',
-        'application/x-download',
-        'application/octetstream',
-        'application/force-download',
-        'application/unknown',
+        "application/download",
+        "binary/octet-stream",
+        "unk",
+        "application/x-download",
+        "application/octetstream",
+        "application/force-download",
+        "application/unknown",
     ):
-        return 'application/octet-stream'
+        return "application/octet-stream"
     return None


@@ -200,14 +208,19 @@ def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
     offset = cdx[9]
     warc = cdx[10]

-    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit() and len(sha1b32) == 32
-            and dt.isdigit()):
+    if not (
+        sha1b32.isalnum()
+        and c_size.isdigit()
+        and offset.isdigit()
+        and len(sha1b32) == 32
+        and dt.isdigit()
+    ):
         return None

-    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+    if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
         return None

-    if mime is None or mime == '-':
+    if mime is None or mime == "-":
         mime = "application/octet-stream"

     if normalize:
@@ -242,16 +255,13 @@ def test_parse_cdx_datetime() -> None:
     assert parse_cdx_datetime("") is None
     assert parse_cdx_datetime("asdf") is None
     assert parse_cdx_datetime("19930203123045") is not None
-    assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020,
-                                                                     month=10,
-                                                                     day=28,
-                                                                     hour=23,
-                                                                     minute=51,
-                                                                     second=3)
+    assert parse_cdx_datetime("20201028235103") == datetime.datetime(
+        year=2020, month=10, day=28, hour=23, minute=51, second=3
+    )


 def datetime_to_cdx(dt: datetime.datetime) -> str:
-    return '%04d%02d%02d%02d%02d%02d' % (
+    return "%04d%02d%02d%02d%02d%02d" % (
         dt.year,
         dt.month,
         dt.day,
@@ -263,13 +273,16 @@ def datetime_to_cdx(dt: datetime.datetime) -> str:

 def test_datetime_to_cdx() -> None:
     assert "20201028235103" == datetime_to_cdx(
-        datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
+        datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+    )


-def requests_retry_session(retries: int = 10,
-                           backoff_factor: int = 3,
-                           status_forcelist: List[int] = [500, 502, 504],
-                           session: requests.Session = None) -> requests.Session:
+def requests_retry_session(
+    retries: int = 10,
+    backoff_factor: int = 3,
+    status_forcelist: List[int] = [500, 502, 504],
+    session: requests.Session = None,
+) -> requests.Session:
     """
     From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
     """
@@ -282,8 +295,8 @@ def requests_retry_session(retries: int = 10,
         status_forcelist=status_forcelist,
     )
     adapter = HTTPAdapter(max_retries=retry)
-    session.mount('http://', adapter)
-    session.mount('https://', adapter)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
     return session

-- 
cgit v1.2.3