diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-09-25 17:51:07 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-25 17:51:07 -0700 | 
| commit | d7830b4a5aad0a59a588e98798711f0e694d50d6 (patch) | |
| tree | 7565cbec74584a146b8ee79bb881fa9f78851f60 /python_hadoop/tests/test_common.py | |
| parent | 6e24eec4b6d1861eba37a0a05220b257e829ebbb (diff) | |
| download | sandcrawler-d7830b4a5aad0a59a588e98798711f0e694d50d6.tar.gz sandcrawler-d7830b4a5aad0a59a588e98798711f0e694d50d6.zip  | |
refactor old python hadoop code into new directory
Diffstat (limited to 'python_hadoop/tests/test_common.py')
| -rw-r--r-- | python_hadoop/tests/test_common.py | 40 | 
1 file changed, 40 insertions, 0 deletions
"""Tests for ``parse_cdx_line`` (imported from the ``common`` module).

File: python_hadoop/tests/test_common.py
"""

from common import *


def test_parse_cdx_line():
    """A well-formed CDX line parses into the expected nested dict.

    Also checks that a trailing newline, or one extra trailing
    whitespace-separated field, yields the same result.
    """

    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
    correct = {
        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
        'file:mime': "application/pdf",
        'file:cdx': {
            'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
            'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
            'dt': "20170828233154",
            'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
            'offset': 931661233,
            'c_size': 210251,
        },
        'f:c': {
            'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
            'd': "2017-08-28T23:31:54",
            # Only the final path component of the 'warc' value above.
            'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
            'o': 931661233,
            'c': 1,
        }
    }

    assert parse_cdx_line(raw) == correct
    assert parse_cdx_line(raw + "\n") == correct
    assert parse_cdx_line(raw + " extra_field") == correct


def test_invalid_cdx():
    """Malformed CDX lines are rejected: ``parse_cdx_line`` returns ``None``."""

    # Final (WARC path) field is the "-" placeholder.
    print("missing warc")
    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
    assert parse_cdx_line(raw) is None

    # Timestamp field has 13 digits instead of the 14 used in the valid case.
    # NOTE(review): the offset field below is "931661233i" (trailing "i"), so
    # this line is malformed in two ways and the assertion may not isolate the
    # datetime check -- confirm whether the "i" is intentional.
    print("bad datetime")
    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
    assert parse_cdx_line(raw) is None
