about | summary | refs | log | tree | commit | diff | stats
path: root/mapreduce/tests/test_common.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-08-24 12:28:51 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-08-24 12:28:51 -0700
commit: 3782311e29b7e477e1936c89f55ff6483fd02e65 (patch)
tree: b4484b9839f24d799d36881dfc85701ad888b94e /mapreduce/tests/test_common.py
parent: 2a998189ef49976bf01cc95acc1f18a73e1f0ff6 (diff)
download: sandcrawler-3782311e29b7e477e1936c89f55ff6483fd02e65.tar.gz
download: sandcrawler-3782311e29b7e477e1936c89f55ff6483fd02e65.zip
rename ./mapreduce to ./python
Diffstat (limited to 'mapreduce/tests/test_common.py')
-rw-r--r-- mapreduce/tests/test_common.py | 40
1 files changed, 0 insertions, 40 deletions
diff --git a/mapreduce/tests/test_common.py b/mapreduce/tests/test_common.py
deleted file mode 100644
index 34d50ed..0000000
--- a/mapreduce/tests/test_common.py
+++ /dev/null
@@ -1,40 +0,0 @@
-
-from common import *
-
-
-def test_parse_cdx_line():
-
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- correct = {
- 'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'file:mime': "application/pdf",
- 'file:cdx': {
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'dt': "20170828233154",
- 'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'offset': 931661233,
- 'c_size': 210251,
- },
- 'f:c': {
- 'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'd': "2017-08-28T23:31:54",
- 'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'o': 931661233,
- 'c': 1,
- }
- }
-
- assert parse_cdx_line(raw) == correct
- assert parse_cdx_line(raw + "\n") == correct
- assert parse_cdx_line(raw + " extra_field") == correct
-
-def test_invalid_cdx():
-
- print("missing warc")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
-
- print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None