diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 17:19:32 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 17:19:32 -0800 |
commit | 5b7f613f77c5bc77f071bcb7cc975c5f4dd02c87 (patch) | |
tree | 8b26a0df650011f8822dc3e992fb1ab40f6ff5bc | |
parent | f916655ab949ee11b3aa6bc84bb3b2118b0748d0 (diff) | |
download | sandcrawler-5b7f613f77c5bc77f071bcb7cc975c5f4dd02c87.tar.gz sandcrawler-5b7f613f77c5bc77f071bcb7cc975c5f4dd02c87.zip |
hack/workaround for protocols.io octet PDFs
-rw-r--r-- | python/sandcrawler/ingest.py | 6 |
1 file changed, 4 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 12d1473..ce3b75e 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -253,8 +253,10 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
-            result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
-            return result
+            # protocols.io PDFs are "application/octet-stream"
+            if not (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+                result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
+                return result
 
         info = self.process_hit(resource, file_meta)
         result.update(info)