about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-15 13:52:05 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-15 13:52:07 -0700
commit b10105cbf74f87acf417dfea9e324b1dbff3b8ec (patch)
tree d77f51038a6ae39c88fe687c3b3f46b45d501071
parent 3f8fed325a3dd8d51652dffab89880c1cf25656b (diff)
download sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.tar.gz
sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.zip
html: fulltext URL prefixes to skip; also fix broken pattern matching
Due to both the 'continue-in-a-for-loop' and 'missing-trailing-commas', the existing pattern matching was not working.
-rw-r--r-- python/sandcrawler/html_metadata.py 23
1 files changed, 19 insertions, 4 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 180c6a2..c64d4f3 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -650,9 +650,14 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
# wiley has a weird almost-blank page we don't want to loop on
- "://onlinelibrary.wiley.com/doi/pdf/"
- "://doi.org/"
- "://dx.doi.org/"
+ "://onlinelibrary.wiley.com/doi/pdf/",
+ "://doi.org/",
+ "://dx.doi.org/",
+]
+
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+ "javascript:",
+ "about:",
]
RELEASE_TYPE_MAP: Dict[str, str] = {
@@ -736,9 +741,19 @@ def html_extract_fulltext_url(
if "in_fulltext_url" in pattern:
if pattern["in_fulltext_url"] not in val:
continue
+ skip_matched = False
for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
if skip_pattern in val.lower():
- continue
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+ if val.lower().startswith(skip_pattern):
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
if url_fuzzy_equal(doc_url, val):
# don't link to self, unless no other options
self_doc_url = (val, pattern.get("technique", "unknown"))