aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-10 17:19:32 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-10 17:19:32 -0800
commit 5b7f613f77c5bc77f071bcb7cc975c5f4dd02c87 (patch)
tree 8b26a0df650011f8822dc3e992fb1ab40f6ff5bc
parent f916655ab949ee11b3aa6bc84bb3b2118b0748d0 (diff)
download sandcrawler-5b7f613f77c5bc77f071bcb7cc975c5f4dd02c87.tar.gz
sandcrawler-5b7f613f77c5bc77f071bcb7cc975c5f4dd02c87.zip
hack/workaround for protocols.io octet PDFs
-rw-r--r-- python/sandcrawler/ingest.py 6
1 files changed, 4 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 12d1473..ce3b75e 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -253,8 +253,10 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
- result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
- return result
+ # protocols.io PDFs are "application/octet-stream"
+ if not (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+ result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
info = self.process_hit(resource, file_meta)
result.update(info)