diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/html_ingest.py | 20 | ||||
| -rw-r--r-- | python/sandcrawler/html_metadata.py | 2 | ||||
| -rw-r--r-- | python/sandcrawler/misc.py | 17 | 
3 files changed, 20 insertions, 19 deletions
| diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index c293a2d..958e81f 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -12,7 +12,7 @@ import pydantic  from selectolax.parser import HTMLParser  from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError -from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url +from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal  from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules @@ -221,24 +221,6 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada      return None - -def url_fuzzy_equal(left: str, right: str) -> bool: -    """ -    TODO: use proper surt library and canonicalization for this check -    """ -    fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:]) -    fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:]) -    if fuzzy_left == fuzzy_right: -        return True -    elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/": -        return True -    return False - -def test_url_fuzzy_equal() -> None: -    assert True == url_fuzzy_equal( -        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", -        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree") -  def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:      """      This function tries to guess if an HTML document represents one of: diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 15f44f4..a52d339 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -9,6 +9,8 @@ from selectolax.parser import HTMLParser  import pydantic  import braveblock +from sandcrawler.misc import url_fuzzy_equal +  # this is a map of metadata keys to CSS selectors  # sources for this list include: diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index 38b2803..a3e2960 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -18,6 +18,23 @@ def clean_url(s: str) -> str:          parsed.colon_before_port = b''      return str(urlcanon.whatwg(parsed)) +def url_fuzzy_equal(left: str, right: str) -> bool: +    """ +    TODO: use proper surt library and canonicalization for this check +    """ +    fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:]) +    fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:]) +    if fuzzy_left == fuzzy_right: +        return True +    elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/": +        return True +    return False + +def test_url_fuzzy_equal() -> None: +    assert True == url_fuzzy_equal( +        "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", +        "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree") +  def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:      """      Takes a file blob (bytestream) and returns hashes and other metadata. | 
