From 5563cb5121c94efcf1819b915e7e7c602215a6e5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 24 Oct 2022 14:22:17 -0700 Subject: ingest: another wall pattern, and check for walls in more places --- python/sandcrawler/ingest_file.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 6c72b96..03277f8 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -173,6 +173,7 @@ class IngestFileWorker(SandcrawlerWorker): "jstage.jst.go.jp/sblogin", "://acw.elsevier.com/SSOCore", "://acw.sciencedirect.com/SSOCore", + "/login?source=", ] self.cookie_blocklist = [ @@ -625,7 +626,7 @@ class IngestFileWorker(SandcrawlerWorker): result["status"] = "skip-url-blocklist" return result - # check against known loginwall URLs + # also check against known loginwall patterns for block in self.wall_blocklist: if block in next_url: # TODO: blocked-wall instead of skip-wall @@ -758,6 +759,12 @@ class IngestFileWorker(SandcrawlerWorker): result["extract_next_hop"] = fulltext_url if not fulltext_url: + # check if we hit a paywall/loginwall + for block in self.wall_blocklist: + if block in resource.terminal_url: + result["status"] = "blocked-wall" + return result + # else, just failed to find link result["status"] = "no-pdf-link" return result next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or "" @@ -837,6 +844,12 @@ class IngestFileWorker(SandcrawlerWorker): if resource.revisit_cdx: result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx) + # check if we hit a paywall/loginwall before trying mimetype + for block in self.wall_blocklist: + if block in resource.terminal_url: + result["status"] = "blocked-wall" + return result + if ingest_type == "pdf": if file_meta["mimetype"] != "application/pdf": result["status"] = "wrong-mimetype" # formerly: "other-mimetype" -- cgit v1.2.3