diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 17:18:52 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 17:18:52 -0700 |
commit | 77577da13afe07b5177452122f4cee77e3357b4e (patch) | |
tree | d37e411121abb805c272f48a6f4579e7af650bb1 /mapreduce/tests/test_common.py | |
parent | 37a775851b2d21f2afe7418a8628c50ae37edc5b (diff) | |
download | sandcrawler-77577da13afe07b5177452122f4cee77e3357b4e.tar.gz sandcrawler-77577da13afe07b5177452122f4cee77e3357b4e.zip |
improve test coverage
Diffstat (limited to 'mapreduce/tests/test_common.py')
-rw-r--r-- | mapreduce/tests/test_common.py | 10 |
1 files changed, 10 insertions, 0 deletions
```diff
diff --git a/mapreduce/tests/test_common.py b/mapreduce/tests/test_common.py
index e2f96bb..34d50ed 100644
--- a/mapreduce/tests/test_common.py
+++ b/mapreduce/tests/test_common.py
@@ -28,3 +28,13 @@ def test_parse_cdx_line():
     assert parse_cdx_line(raw) == correct
     assert parse_cdx_line(raw + "\n") == correct
     assert parse_cdx_line(raw + " extra_field") == correct
+
+def test_invalid_cdx():
+
+    print("missing warc")
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
+    assert parse_cdx_line(raw) == None
+
+    print("bad datetime")
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    assert parse_cdx_line(raw) == None
```