diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-02-27 17:07:44 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-27 17:07:44 -0800 | 
| commit | 137a3334729682ae3d8805bd130d3464ed3e1bb8 (patch) | |
| tree | 6baed2f79d467626cecbec24a5617dcffd063758 /python | |
| parent | 2069332aa58bd7d5804639a1adec3cde0118a5b0 (diff) | |
| download | sandcrawler-137a3334729682ae3d8805bd130d3464ed3e1bb8.tar.gz sandcrawler-137a3334729682ae3d8805bd130d3464ed3e1bb8.zip | |
more mime normalization
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/misc.py | 19 | 
1 files changed, 18 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 5713199..88669e6 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -51,10 +51,11 @@ NORMAL_MIME = (
     'application/postscript',
     'text/html',
     'text/xml',
+    'application/octet-stream',
 )
 
 def normalize_mime(raw):
-    raw = raw.lower()
+    raw = raw.lower().strip()
     for norm in NORMAL_MIME:
         if raw.startswith(norm):
             return norm
@@ -64,6 +65,20 @@ def normalize_mime(raw):
         return 'text/xml'
     if raw.startswith('application/x-pdf'):
         return 'application/pdf'
+    if raw in (
+            '.pdf',
+            ):
+        return 'application/pdf'
+    if raw in (
+            'application/download',
+            'binary/octet-stream',
+            'unk',
+            'application/x-download',
+            'application/octetstream',
+            'application/force-download',
+            'application/unknown',
+            ):
+        return 'application/octet-stream'
     return None
 
 
@@ -76,6 +91,8 @@ def test_normalize_mime():
     assert normalize_mime("application/xml+stuff") == "text/xml"
     assert normalize_mime("application/x-pdf") == "application/pdf"
     assert normalize_mime("application/x-html") is None
+    assert normalize_mime("unk") == "application/octet-stream"
+    assert normalize_mime("binary/octet-stream") == "application/octet-stream"
 
 
 def parse_cdx_line(raw_cdx, normalize=True):
```
