diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 14:22:17 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 14:22:17 -0700 |
commit | 5563cb5121c94efcf1819b915e7e7c602215a6e5 (patch) | |
tree | 326eab69f82df4209dc398841151d239507a4690 | |
parent | 4f0d10f4b38534eda673a8dfe28e3a58af9a8a8a (diff) | |
download | sandcrawler-5563cb5121c94efcf1819b915e7e7c602215a6e5.tar.gz sandcrawler-5563cb5121c94efcf1819b915e7e7c602215a6e5.zip |
ingest: another wall pattern, and check for walls in more places
-rw-r--r-- | python/sandcrawler/ingest_file.py | 15 |
1 files changed, 14 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 6c72b96..03277f8 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -173,6 +173,7 @@ class IngestFileWorker(SandcrawlerWorker): "jstage.jst.go.jp/sblogin", "://acw.elsevier.com/SSOCore", "://acw.sciencedirect.com/SSOCore", + "/login?source=", ] self.cookie_blocklist = [ @@ -625,7 +626,7 @@ class IngestFileWorker(SandcrawlerWorker): result["status"] = "skip-url-blocklist" return result - # check against known loginwall URLs + # also check against known loginwall patterns for block in self.wall_blocklist: if block in next_url: # TODO: blocked-wall instead of skip-wall @@ -758,6 +759,12 @@ class IngestFileWorker(SandcrawlerWorker): result["extract_next_hop"] = fulltext_url if not fulltext_url: + # check if we hit a paywall/loginwall + for block in self.wall_blocklist: + if block in resource.terminal_url: + result["status"] = "blocked-wall" + return result + # else, just failed to find link result["status"] = "no-pdf-link" return result next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or "" @@ -837,6 +844,12 @@ class IngestFileWorker(SandcrawlerWorker): if resource.revisit_cdx: result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx) + # check if we hit a paywall/loginwall before trying mimetype + for block in self.wall_blocklist: + if block in resource.terminal_url: + result["status"] = "blocked-wall" + return result + if ingest_type == "pdf": if file_meta["mimetype"] != "application/pdf": result["status"] = "wrong-mimetype" # formerly: "other-mimetype" |