diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 21:55:12 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 21:55:12 -0800 |
commit | b6911f63a277007523e0dc265a339a80be80946e (patch) | |
tree | 56469e3cfdf146e235cdda50a5be68deb1406c18 /python/sandcrawler/misc.py | |
parent | a68aadc4107fc68dc2748c52dab8a4bd92cca022 (diff) | |
download | sandcrawler-b6911f63a277007523e0dc265a339a80be80946e.tar.gz sandcrawler-b6911f63a277007523e0dc265a339a80be80946e.zip |
move fuzzy URL match method to misc
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r-- | python/sandcrawler/misc.py | 17 |
1 files changed, 17 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 38b2803..a3e2960 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -18,6 +18,23 @@ def clean_url(s: str) -> str:
         parsed.colon_before_port = b''
     return str(urlcanon.whatwg(parsed))
 
+def url_fuzzy_equal(left: str, right: str) -> bool:
+    """
+    TODO: use proper surt library and canonicalization for this check
+    """
+    fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
+    fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
+    if fuzzy_left == fuzzy_right:
+        return True
+    elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
+        return True
+    return False
+
+def test_url_fuzzy_equal() -> None:
+    assert True == url_fuzzy_equal(
+        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree")
+
 def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     """
     Takes a file blob (bytestream) and returns hashes and other metadata.