diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-24 12:28:51 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-24 12:28:51 -0700 |
commit | 3782311e29b7e477e1936c89f55ff6483fd02e65 (patch) | |
tree | b4484b9839f24d799d36881dfc85701ad888b94e /mapreduce/tests/test_common.py | |
parent | 2a998189ef49976bf01cc95acc1f18a73e1f0ff6 (diff) | |
download | sandcrawler-3782311e29b7e477e1936c89f55ff6483fd02e65.tar.gz sandcrawler-3782311e29b7e477e1936c89f55ff6483fd02e65.zip |
rename ./mapreduce to ./python
Diffstat (limited to 'mapreduce/tests/test_common.py')
-rw-r--r-- | mapreduce/tests/test_common.py | 40 |
1 file changed, 0 insertions, 40 deletions
diff --git a/mapreduce/tests/test_common.py b/mapreduce/tests/test_common.py deleted file mode 100644 index 34d50ed..0000000 --- a/mapreduce/tests/test_common.py +++ /dev/null @@ -1,40 +0,0 @@ - -from common import * - - -def test_parse_cdx_line(): - - raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" - correct = { - 'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G", - 'file:mime': "application/pdf", - 'file:cdx': { - 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", - 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", - 'dt': "20170828233154", - 'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", - 'offset': 931661233, - 'c_size': 210251, - }, - 'f:c': { - 'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", - 'd': "2017-08-28T23:31:54", - 'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", - 'o': 931661233, - 'c': 1, - } - } - - assert parse_cdx_line(raw) == correct - assert parse_cdx_line(raw + "\n") == correct - assert parse_cdx_line(raw + " extra_field") == correct - -def test_invalid_cdx(): - - print("missing warc") - raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 
https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -" - assert parse_cdx_line(raw) == None - - print("bad datetime") - raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" - assert parse_cdx_line(raw) == None |