about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-10-24 14:22:17 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-10-24 14:22:17 -0700
commit 5563cb5121c94efcf1819b915e7e7c602215a6e5 (patch)
tree 326eab69f82df4209dc398841151d239507a4690
parent 4f0d10f4b38534eda673a8dfe28e3a58af9a8a8a (diff)
download sandcrawler-5563cb5121c94efcf1819b915e7e7c602215a6e5.tar.gz
sandcrawler-5563cb5121c94efcf1819b915e7e7c602215a6e5.zip
ingest: another wall pattern, and check for walls in more places
-rw-r--r-- python/sandcrawler/ingest_file.py | 15
1 files changed, 14 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 6c72b96..03277f8 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -173,6 +173,7 @@ class IngestFileWorker(SandcrawlerWorker):
"jstage.jst.go.jp/sblogin",
"://acw.elsevier.com/SSOCore",
"://acw.sciencedirect.com/SSOCore",
+ "/login?source=",
]
self.cookie_blocklist = [
@@ -625,7 +626,7 @@ class IngestFileWorker(SandcrawlerWorker):
result["status"] = "skip-url-blocklist"
return result
- # check against known loginwall URLs
+ # also check against known loginwall patterns
for block in self.wall_blocklist:
if block in next_url:
# TODO: blocked-wall instead of skip-wall
@@ -758,6 +759,12 @@ class IngestFileWorker(SandcrawlerWorker):
result["extract_next_hop"] = fulltext_url
if not fulltext_url:
+ # check if we hit a paywall/loginwall
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+ # else, just failed to find link
result["status"] = "no-pdf-link"
return result
next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
@@ -837,6 +844,12 @@ class IngestFileWorker(SandcrawlerWorker):
if resource.revisit_cdx:
result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+ # check if we hit a paywall/loginwall before trying mimetype
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+
if ingest_type == "pdf":
if file_meta["mimetype"] != "application/pdf":
result["status"] = "wrong-mimetype" # formerly: "other-mimetype"