about summary refs log tree commit diff stats
path: root/python/sandcrawler/ingest_file.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r--  python/sandcrawler/ingest_file.py  48
1 file changed, 41 insertions(+), 7 deletions(-)
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index d0c3e0e..03277f8 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -20,6 +20,7 @@ from sandcrawler.ia import (
NoCaptureError,
PetaboxError,
ResourceResult,
+ SavePageNowBackoffError,
SavePageNowClient,
SavePageNowError,
WaybackClient,
@@ -103,7 +104,7 @@ class IngestFileWorker(SandcrawlerWorker):
self.pdftext_sink = kwargs.get("pdftext_sink")
self.xmldoc_sink = kwargs.get("xmldoc_sink")
self.htmlteixml_sink = kwargs.get("htmlteixml_sink")
- self.max_hops = 6
+ self.max_hops = 8
self.try_existing_ingest = kwargs.get("try_existing_ingest", False)
self.try_existing_grobid = kwargs.get("try_existing_grobid", True)
@@ -115,8 +116,11 @@ class IngestFileWorker(SandcrawlerWorker):
self.max_html_resources = 200
self.base_url_blocklist = [
- # robot blocking
+ "://localhost/",
+ "://127.0.0.1/",
+ # robot blocking / rate-limited
"://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
# temporary, until we implement specific fetch and 'petabox' output
"://archive.org/",
"://www.archive.org/",
@@ -125,8 +129,8 @@ class IngestFileWorker(SandcrawlerWorker):
"://openlibrary.org/",
"://www.openlibrary.org/",
"://fatcat.wiki/",
+ "://scholar.archive.org/",
"://orcid.org/",
- "://doaj.org/",
# Domain squats
"://bartandjones.com",
"://ijretm.com",
@@ -150,8 +154,11 @@ class IngestFileWorker(SandcrawlerWorker):
"doi.org/10.2307/", # JSTOR; slow and many redirects
"doi.org/10.18730/", # fao.org: database entry
"doi.org/10.15468/", # gbif.org: database entry
+ "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
# deprecated domain (doesn't redirect correctly)
"://edoc.mpg.de/",
+ # bogus/spam PDFs
+ "://isiarticles.com/",
]
self.wall_blocklist = [
@@ -163,12 +170,18 @@ class IngestFileWorker(SandcrawlerWorker):
"/password-login",
"://gateway.isiknowledge.com/",
"/login?TARGET=",
+ "jstage.jst.go.jp/sblogin",
+ "://acw.elsevier.com/SSOCore",
+ "://acw.sciencedirect.com/SSOCore",
+ "/login?source=",
]
self.cookie_blocklist = [
"/cookieAbsent",
"cookieSet=1",
"error=cookies_not_supported",
+ # SPNv2 seems to work (not end up here), but heritrix fails
+ "://secure.jbs.elsevierhealth.com/",
]
self.src_valid_mimetypes = [
@@ -445,7 +458,10 @@ class IngestFileWorker(SandcrawlerWorker):
return dict(status="html-selectolax-error")
html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
assert html_biblio
- html_body = html_extract_body_teixml(resource.body)
+ try:
+ html_body = html_extract_body_teixml(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="html-teixml-error")
html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
html_scope = html_guess_scope(
resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")
@@ -610,7 +626,7 @@ class IngestFileWorker(SandcrawlerWorker):
result["status"] = "skip-url-blocklist"
return result
- # check against known loginwall URLs
+ # also check against known loginwall patterns
for block in self.wall_blocklist:
if block in next_url:
# TODO: blocked-wall instead of skip-wall
@@ -632,6 +648,12 @@ class IngestFileWorker(SandcrawlerWorker):
result["status"] = "spn2-error"
result["error_message"] = str(e)[:1600]
return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
except PetaboxError as e:
result["status"] = "petabox-error"
result["error_message"] = str(e)[:1600]
@@ -683,7 +705,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not resource.body:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -699,7 +721,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not resource.body or file_meta["size_bytes"] == 0:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
# here we split based on ingest type to try and extract a next hop
@@ -737,6 +759,12 @@ class IngestFileWorker(SandcrawlerWorker):
result["extract_next_hop"] = fulltext_url
if not fulltext_url:
+ # check if we hit a paywall/loginwall
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+ # else, just failed to find link
result["status"] = "no-pdf-link"
return result
next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
@@ -816,6 +844,12 @@ class IngestFileWorker(SandcrawlerWorker):
if resource.revisit_cdx:
result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+ # check if we hit a paywall/loginwall before trying mimetype
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+
if ingest_type == "pdf":
if file_meta["mimetype"] != "application/pdf":
result["status"] = "wrong-mimetype" # formerly: "other-mimetype"