diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-27 17:07:44 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-27 17:07:44 -0800 |
commit | 137a3334729682ae3d8805bd130d3464ed3e1bb8 (patch) | |
tree | 6baed2f79d467626cecbec24a5617dcffd063758 /python | |
parent | 2069332aa58bd7d5804639a1adec3cde0118a5b0 (diff) | |
download | sandcrawler-137a3334729682ae3d8805bd130d3464ed3e1bb8.tar.gz sandcrawler-137a3334729682ae3d8805bd130d3464ed3e1bb8.zip |
more mime normalization
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/misc.py | 19 |
1 file changed, 18 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 5713199..88669e6 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -51,10 +51,11 @@ NORMAL_MIME = (
     'application/postscript',
     'text/html',
     'text/xml',
+    'application/octet-stream',
 )
 
 def normalize_mime(raw):
-    raw = raw.lower()
+    raw = raw.lower().strip()
     for norm in NORMAL_MIME:
         if raw.startswith(norm):
             return norm
@@ -64,6 +65,20 @@ def normalize_mime(raw):
         return 'text/xml'
     if raw.startswith('application/x-pdf'):
         return 'application/pdf'
+    if raw in (
+        '.pdf',
+    ):
+        return 'application/pdf'
+    if raw in (
+        'application/download',
+        'binary/octet-stream',
+        'unk',
+        'application/x-download',
+        'application/octetstream',
+        'application/force-download',
+        'application/unknown',
+    ):
+        return 'application/octet-stream'
     return None
 
 
@@ -76,6 +91,8 @@ def test_normalize_mime():
     assert normalize_mime("application/xml+stuff") == "text/xml"
     assert normalize_mime("application/x-pdf") == "application/pdf"
     assert normalize_mime("application/x-html") is None
+    assert normalize_mime("unk") == "application/octet-stream"
+    assert normalize_mime("binary/octet-stream") == "application/octet-stream"
 
 
 def parse_cdx_line(raw_cdx, normalize=True):
```