html: fulltext URL prefixes to skip; also fix broken pattern matching

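Two bugs kept the existing skip-pattern matching from working. First, the `continue` sat inside the inner `for` loop over the skip patterns, so it only advanced that inner loop and never skipped the candidate URL in the outer loop. Second, the skip list was missing trailing commas, so Python concatenated the adjacent string literals into a single pattern that never matched.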
author: Bryan Newbold <bnewbold@archive.org> 2022-07-15 13:52:05 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-15 13:52:07 -0700
commit: b10105cbf74f87acf417dfea9e324b1dbff3b8ec (patch)
tree: d77f51038a6ae39c88fe687c3b3f46b45d501071
parent: 3f8fed325a3dd8d51652dffab89880c1cf25656b (diff)
download: sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.tar.gz
sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.zip
1 file changed, 19 insertions, 4 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 180c6a2..c64d4f3 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -650,9 +650,14 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
     # wiley has a weird almost-blank page we don't want to loop on
-    "://onlinelibrary.wiley.com/doi/pdf/"
-    "://doi.org/"
-    "://dx.doi.org/"
+    "://onlinelibrary.wiley.com/doi/pdf/",
+    "://doi.org/",
+    "://dx.doi.org/",
+]
+
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+    "javascript:",
+    "about:",
 ]
 
 RELEASE_TYPE_MAP: Dict[str, str] = {
@@ -736,9 +741,19 @@ def html_extract_fulltext_url(
         if "in_fulltext_url" in pattern:
             if pattern["in_fulltext_url"] not in val:
                 continue
+        skip_matched = False
         for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
             if skip_pattern in val.lower():
-                continue
+                skip_matched = True
+                break
+        if skip_matched:
+            continue
+        for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+            if val.lower().startswith(skip_pattern):
+                skip_matched = True
+                break
+        if skip_matched:
+            continue
         if url_fuzzy_equal(doc_url, val):
             # don't link to self, unless no other options
             self_doc_url = (val, pattern.get("technique", "unknown"))
author	Bryan Newbold <bnewbold@archive.org>	2022-07-15 13:52:05 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2022-07-15 13:52:07 -0700
commit	b10105cbf74f87acf417dfea9e324b1dbff3b8ec (patch)
tree	d77f51038a6ae39c88fe687c3b3f46b45d501071
parent	3f8fed325a3dd8d51652dffab89880c1cf25656b (diff)
download	sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.tar.gz sandcrawler-b10105cbf74f87acf417dfea9e324b1dbff3b8ec.zip