1 file changed, 41 insertions, 7 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index d0c3e0e..03277f8 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -20,6 +20,7 @@ from sandcrawler.ia import (
     NoCaptureError,
     PetaboxError,
     ResourceResult,
+    SavePageNowBackoffError,
     SavePageNowClient,
     SavePageNowError,
     WaybackClient,
@@ -103,7 +104,7 @@ class IngestFileWorker(SandcrawlerWorker):
         self.pdftext_sink = kwargs.get("pdftext_sink")
         self.xmldoc_sink = kwargs.get("xmldoc_sink")
         self.htmlteixml_sink = kwargs.get("htmlteixml_sink")
-        self.max_hops = 6
+        self.max_hops = 8
 
         self.try_existing_ingest = kwargs.get("try_existing_ingest", False)
         self.try_existing_grobid = kwargs.get("try_existing_grobid", True)
@@ -115,8 +116,11 @@ class IngestFileWorker(SandcrawlerWorker):
         self.max_html_resources = 200
 
         self.base_url_blocklist = [
-            # robot blocking
+            "://localhost/",
+            "://127.0.0.1/",
+            # robot blocking / rate-limited
             "://hkvalidate.perfdrive.com/",
+            "://ieeexplore.ieee.org/",
             # temporary, until we implement specific fetch and 'petabox' output
             "://archive.org/",
             "://www.archive.org/",
@@ -125,8 +129,8 @@ class IngestFileWorker(SandcrawlerWorker):
             "://openlibrary.org/",
             "://www.openlibrary.org/",
             "://fatcat.wiki/",
+            "://scholar.archive.org/",
             "://orcid.org/",
-            "://doaj.org/",
             # Domain squats
             "://bartandjones.com",
             "://ijretm.com",
@@ -150,8 +154,11 @@ class IngestFileWorker(SandcrawlerWorker):
             "doi.org/10.2307/",  # JSTOR; slow and many redirects
             "doi.org/10.18730/",  # fao.org: database entry
             "doi.org/10.15468/",  # gbif.org: database entry
+            "doi.org/10.48550/",  # arxiv.org: redundant with direct ingest
             # deprecated domain (doesn't redirect correctly)
             "://edoc.mpg.de/",
+            # bogus/spam PDFs
+            "://isiarticles.com/",
         ]
 
         self.wall_blocklist = [
@@ -163,12 +170,18 @@ class IngestFileWorker(SandcrawlerWorker):
             "/password-login",
             "://gateway.isiknowledge.com/",
             "/login?TARGET=",
+            "jstage.jst.go.jp/sblogin",
+            "://acw.elsevier.com/SSOCore",
+            "://acw.sciencedirect.com/SSOCore",
+            "/login?source=",
         ]
 
         self.cookie_blocklist = [
             "/cookieAbsent",
             "cookieSet=1",
             "error=cookies_not_supported",
+            # SPNv2 seems to work (does not end up here), but heritrix fails
+            "://secure.jbs.elsevierhealth.com/",
         ]
 
         self.src_valid_mimetypes = [
@@ -445,7 +458,10 @@ class IngestFileWorker(SandcrawlerWorker):
             return dict(status="html-selectolax-error")
         html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
         assert html_biblio
-        html_body = html_extract_body_teixml(resource.body)
+        try:
+            html_body = html_extract_body_teixml(resource.body)
+        except xml.etree.ElementTree.ParseError:
+            return dict(status="html-teixml-error")
         html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
         html_scope = html_guess_scope(
             resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")
@@ -610,7 +626,7 @@ class IngestFileWorker(SandcrawlerWorker):
                     result["status"] = "skip-url-blocklist"
                     return result
 
-            # check against known loginwall URLs
+            # also check against known loginwall patterns
             for block in self.wall_blocklist:
                 if block in next_url:
                     # TODO: blocked-wall instead of skip-wall
@@ -632,6 +648,12 @@ class IngestFileWorker(SandcrawlerWorker):
                 result["status"] = "spn2-error"
                 result["error_message"] = str(e)[:1600]
                 return result
+            except SavePageNowBackoffError as e:
+                result["status"] = "spn2-backoff"
+                result["error_message"] = str(e)[:1600]
+                # small sleep as a slow-down
+                time.sleep(2.0)
+                return result
             except PetaboxError as e:
                 result["status"] = "petabox-error"
                 result["error_message"] = str(e)[:1600]
@@ -683,7 +705,7 @@ class IngestFileWorker(SandcrawlerWorker):
                         return result
 
             if not resource.body:
-                result["status"] = "null-body"
+                result["status"] = "empty-blob"
                 return result
 
             if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -699,7 +721,7 @@ class IngestFileWorker(SandcrawlerWorker):
                 return result
 
             if not resource.body or file_meta["size_bytes"] == 0:
-                result["status"] = "null-body"
+                result["status"] = "empty-blob"
                 return result
 
             # here we split based on ingest type to try and extract a next hop
@@ -737,6 +759,12 @@ class IngestFileWorker(SandcrawlerWorker):
 
                 result["extract_next_hop"] = fulltext_url
                 if not fulltext_url:
+                    # check if we hit a paywall/loginwall
+                    for block in self.wall_blocklist:
+                        if block in resource.terminal_url:
+                            result["status"] = "blocked-wall"
+                            return result
+                    # else, just failed to find link
                     result["status"] = "no-pdf-link"
                     return result
                 next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
@@ -816,6 +844,12 @@ class IngestFileWorker(SandcrawlerWorker):
         if resource.revisit_cdx:
             result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
 
+        # check if we hit a paywall/loginwall before trying mimetype
+        for block in self.wall_blocklist:
+            if block in resource.terminal_url:
+                result["status"] = "blocked-wall"
+                return result
+
         if ingest_type == "pdf":
             if file_meta["mimetype"] != "application/pdf":
                 result["status"] = "wrong-mimetype"  # formerly: "other-mimetype"