diff options
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 48 |
1 file changed, 41 insertions, 7 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index d0c3e0e..03277f8 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -20,6 +20,7 @@ from sandcrawler.ia import ( NoCaptureError, PetaboxError, ResourceResult, + SavePageNowBackoffError, SavePageNowClient, SavePageNowError, WaybackClient, @@ -103,7 +104,7 @@ class IngestFileWorker(SandcrawlerWorker): self.pdftext_sink = kwargs.get("pdftext_sink") self.xmldoc_sink = kwargs.get("xmldoc_sink") self.htmlteixml_sink = kwargs.get("htmlteixml_sink") - self.max_hops = 6 + self.max_hops = 8 self.try_existing_ingest = kwargs.get("try_existing_ingest", False) self.try_existing_grobid = kwargs.get("try_existing_grobid", True) @@ -115,8 +116,11 @@ class IngestFileWorker(SandcrawlerWorker): self.max_html_resources = 200 self.base_url_blocklist = [ - # robot blocking + "://localhost/", + "://127.0.0.1/", + # robot blocking / rate-limited "://hkvalidate.perfdrive.com/", + "://ieeexplore.ieee.org/", # temporary, until we implement specific fetch and 'petabox' output "://archive.org/", "://www.archive.org/", @@ -125,8 +129,8 @@ class IngestFileWorker(SandcrawlerWorker): "://openlibrary.org/", "://www.openlibrary.org/", "://fatcat.wiki/", + "://scholar.archive.org/", "://orcid.org/", - "://doaj.org/", # Domain squats "://bartandjones.com", "://ijretm.com", @@ -150,8 +154,11 @@ class IngestFileWorker(SandcrawlerWorker): "doi.org/10.2307/", # JSTOR; slow and many redirects "doi.org/10.18730/", # fao.org: database entry "doi.org/10.15468/", # gbif.org: database entry + "doi.org/10.48550/", # arxiv.org: redundant with direct ingest # deprecated domain (doesn't redirect correctly) "://edoc.mpg.de/", + # bogus/spam PDFs + "://isiarticles.com/", ] self.wall_blocklist = [ @@ -163,12 +170,18 @@ class IngestFileWorker(SandcrawlerWorker): "/password-login", "://gateway.isiknowledge.com/", "/login?TARGET=", + "jstage.jst.go.jp/sblogin", + "://acw.elsevier.com/SSOCore", + 
"://acw.sciencedirect.com/SSOCore", + "/login?source=", ] self.cookie_blocklist = [ "/cookieAbsent", "cookieSet=1", "error=cookies_not_supported", + # SPNv2 seems to work (not end up here), but heritrix fails + "://secure.jbs.elsevierhealth.com/", ] self.src_valid_mimetypes = [ @@ -445,7 +458,10 @@ class IngestFileWorker(SandcrawlerWorker): return dict(status="html-selectolax-error") html_biblio = html_extract_biblio(resource.terminal_url, html_doc) assert html_biblio - html_body = html_extract_body_teixml(resource.body) + try: + html_body = html_extract_body_teixml(resource.body) + except xml.etree.ElementTree.ParseError: + return dict(status="html-teixml-error") html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio) html_scope = html_guess_scope( resource.terminal_url, html_doc, html_biblio, html_body.get("word_count") @@ -610,7 +626,7 @@ class IngestFileWorker(SandcrawlerWorker): result["status"] = "skip-url-blocklist" return result - # check against known loginwall URLs + # also check against known loginwall patterns for block in self.wall_blocklist: if block in next_url: # TODO: blocked-wall instead of skip-wall @@ -632,6 +648,12 @@ class IngestFileWorker(SandcrawlerWorker): result["status"] = "spn2-error" result["error_message"] = str(e)[:1600] return result + except SavePageNowBackoffError as e: + result["status"] = "spn2-backoff" + result["error_message"] = str(e)[:1600] + # small sleep as a slow-down + time.sleep(2.0) + return result except PetaboxError as e: result["status"] = "petabox-error" result["error_message"] = str(e)[:1600] @@ -683,7 +705,7 @@ class IngestFileWorker(SandcrawlerWorker): return result if not resource.body: - result["status"] = "null-body" + result["status"] = "empty-blob" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: @@ -699,7 +721,7 @@ class IngestFileWorker(SandcrawlerWorker): return result if not resource.body or file_meta["size_bytes"] == 0: - result["status"] = "null-body" + 
result["status"] = "empty-blob" return result # here we split based on ingest type to try and extract a next hop @@ -737,6 +759,12 @@ class IngestFileWorker(SandcrawlerWorker): result["extract_next_hop"] = fulltext_url if not fulltext_url: + # check if we hit a paywall/loginwall + for block in self.wall_blocklist: + if block in resource.terminal_url: + result["status"] = "blocked-wall" + return result + # else, just failed to find link result["status"] = "no-pdf-link" return result next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or "" @@ -816,6 +844,12 @@ class IngestFileWorker(SandcrawlerWorker): if resource.revisit_cdx: result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx) + # check if we hit a paywall/loginwall before trying mimetype + for block in self.wall_blocklist: + if block in resource.terminal_url: + result["status"] = "blocked-wall" + return result + if ingest_type == "pdf": if file_meta["mimetype"] != "application/pdf": result["status"] = "wrong-mimetype" # formerly: "other-mimetype" |