diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 13:52:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 13:52:07 -0700 |
commit | b10105cbf74f87acf417dfea9e324b1dbff3b8ec (patch) | |
tree | d77f51038a6ae39c88fe687c3b3f46b45d501071 | |
parent | 3f8fed325a3dd8d51652dffab89880c1cf25656b (diff) | |
download | sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.tar.gz sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.zip |
html: fulltext URL prefixes to skip; also fix broken pattern matching
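Due to both the 'continue-in-a-for-loop' bug (the `continue` only advanced the inner skip-pattern loop, so a matching URL was never actually skipped) and the 'missing-trailing-commas' bug
(the adjacent string literals were implicitly concatenated into a single pattern), the existing pattern matching was not working.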
-rw-r--r-- | python/sandcrawler/html_metadata.py | 23 |
1 file changed, 19 insertions, 4 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 180c6a2..c64d4f3 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -650,9 +650,14 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ FULLTEXT_URL_PATTERNS_SKIP: List[str] = [ # wiley has a weird almost-blank page we don't want to loop on - "://onlinelibrary.wiley.com/doi/pdf/" - "://doi.org/" - "://dx.doi.org/" + "://onlinelibrary.wiley.com/doi/pdf/", + "://doi.org/", + "://dx.doi.org/", +] + +FULLTEXT_URL_PREFIX_SKIP: List[str] = [ + "javascript:", + "about:", ] RELEASE_TYPE_MAP: Dict[str, str] = { @@ -736,9 +741,19 @@ def html_extract_fulltext_url( if "in_fulltext_url" in pattern: if pattern["in_fulltext_url"] not in val: continue + skip_matched = False for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP: if skip_pattern in val.lower(): - continue + skip_matched = True + break + if skip_matched: + continue + for skip_pattern in FULLTEXT_URL_PREFIX_SKIP: + if val.lower().startswith(skip_pattern): + skip_matched = True + break + if skip_matched: + continue if url_fuzzy_equal(doc_url, val): # don't link to self, unless no other options self_doc_url = (val, pattern.get("technique", "unknown")) |