From b6911f63a277007523e0dc265a339a80be80946e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 8 Nov 2020 21:55:12 -0800 Subject: move fuzzy URL match method to misc --- python/sandcrawler/html_ingest.py | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'python/sandcrawler/html_ingest.py') diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index c293a2d..958e81f 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -12,7 +12,7 @@ import pydantic from selectolax.parser import HTMLParser from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError -from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url +from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules @@ -221,24 +221,6 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada return None - -def url_fuzzy_equal(left: str, right: str) -> bool: - """ - TODO: use proper surt library and canonicalization for this check - """ - fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:]) - fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:]) - if fuzzy_left == fuzzy_right: - return True - elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/": - return True - return False - -def test_url_fuzzy_equal() -> None: - assert True == url_fuzzy_equal( - "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", - "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree") - def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str: """ This function tries to guess if an HTML document represents one of: -- cgit v1.2.3