move fuzzy URL match method to misc

author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:55:12 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:55:12 -0800
commit: b6911f63a277007523e0dc265a339a80be80946e (patch)
tree: 56469e3cfdf146e235cdda50a5be68deb1406c18 /python
parent: a68aadc4107fc68dc2748c52dab8a4bd92cca022 (diff)
download: sandcrawler-b6911f63a277007523e0dc265a339a80be80946e.tar.gz sandcrawler-b6911f63a277007523e0dc265a339a80be80946e.zip
3 files changed, 20 insertions, 19 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index c293a2d..958e81f 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -12,7 +12,7 @@ import pydantic
 from selectolax.parser import HTMLParser
 
 from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url
+from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
 from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
 
 
@@ -221,24 +221,6 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
 
     return None
 
-
-def url_fuzzy_equal(left: str, right: str) -> bool:
-    """
-    TODO: use proper surt library and canonicalization for this check
-    """
-    fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
-    fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
-    if fuzzy_left == fuzzy_right:
-        return True
-    elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
-        return True
-    return False
-
-def test_url_fuzzy_equal() -> None:
-    assert True == url_fuzzy_equal(
-        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
-        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree")
-
 def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
     """
     This function tries to guess if an HTML document represents one of:
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 15f44f4..a52d339 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -9,6 +9,8 @@ from selectolax.parser import HTMLParser
 import pydantic
 import braveblock
 
+from sandcrawler.misc import url_fuzzy_equal
+
 
 # this is a map of metadata keys to CSS selectors
 # sources for this list include:
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 38b2803..a3e2960 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -18,6 +18,23 @@ def clean_url(s: str) -> str:
         parsed.colon_before_port = b''
     return str(urlcanon.whatwg(parsed))
 
+def url_fuzzy_equal(left: str, right: str) -> bool:
+    """
+    TODO: use proper surt library and canonicalization for this check
+    """
+    fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
+    fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
+    if fuzzy_left == fuzzy_right:
+        return True
+    elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
+        return True
+    return False
+
+def test_url_fuzzy_equal() -> None:
+    assert True == url_fuzzy_equal(
+        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree")
+
 def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     """
     Takes a file blob (bytestream) and returns hashes and other metadata.
author	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:55:12 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:55:12 -0800
commit	b6911f63a277007523e0dc265a339a80be80946e (patch)
tree	56469e3cfdf146e235cdda50a5be68deb1406c18 /python
parent	a68aadc4107fc68dc2748c52dab8a4bd92cca022 (diff)
download	sandcrawler-b6911f63a277007523e0dc265a339a80be80946e.tar.gz sandcrawler-b6911f63a277007523e0dc265a339a80be80946e.zip