1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
import pytest
from sandcrawler import (
b32_hex,
clean_url,
gen_file_metadata,
gen_file_metadata_path,
parse_cdx_line,
)
def test_gen_file_metadata():
    """Check gen_file_metadata() on PDF, HTML, and plain-text byte strings."""
    # Valid (but very small) PDF fixture: the whole metadata dict must match.
    with open("tests/files/dummy.pdf", "rb") as pdf_handle:
        pdf_bytes = pdf_handle.read()
    expected_pdf_meta = {
        "mimetype": "application/pdf",
        "md5hex": "2942bfabb3d05332b66eb128e0842cff",
        "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
        "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
        "size_bytes": 13264,
    }
    assert gen_file_metadata(pdf_bytes) == expected_pdf_meta

    # A minimal HTML document: only the detected mimetype is checked.
    html_bytes = b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
    html_meta = gen_file_metadata(html_bytes)
    assert html_meta["mimetype"] == "text/html"

    # Bogus text: expected to be reported as plain text, 8 bytes long.
    text_meta = gen_file_metadata(b"asdf1234")
    assert text_meta["mimetype"] == "text/plain"
    assert text_meta["size_bytes"] == 8
def test_gen_file_metadata_path():
    """Check gen_file_metadata_path() against the small PDF fixture on disk."""
    # Valid (but very small) PDF file, passed by path rather than as bytes.
    fixture_path = "tests/files/dummy.pdf"
    expected_meta = {
        "mimetype": "application/pdf",
        "md5hex": "2942bfabb3d05332b66eb128e0842cff",
        "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
        "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
        "size_bytes": 13264,
    }
    assert gen_file_metadata_path(fixture_path) == expected_meta
def test_b32_hex():
    """Check b32_hex() on base32 input, hex pass-through, and invalid input."""
    expected_hex = "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"

    # Valid base32 digest, with and without the "sha1:" prefix.
    assert b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC") == expected_hex
    assert b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC") == expected_hex

    # A value that is already a hex digest is passed through unchanged.
    sha1hex = "bda3c1017d52e826bbd1da51efad877272d300f9"
    assert b32_hex(sha1hex) == sha1hex

    # Invalid input must raise ValueError. Only the call itself belongs in
    # the pytest.raises block: wrapping it in `assert ... == ...` implies a
    # return value is being verified, but that comparison is never evaluated
    # once the call raises.
    with pytest.raises(ValueError):
        b32_hex("blah")
def test_parse_cdx_line():
    """Check parse_cdx_line() on a well-formed CDX line and small variations."""
    cdx_line = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
    expected_record = {
        "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
        "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6",
        "mimetype": "application/pdf",
        "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
        "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
        "datetime": "20170828233154",
        "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
        "warc_offset": 931661233,
        "warc_csize": 210251,
        "http_status": 200,
    }

    # The bare line, the line with a trailing newline, and the line with an
    # extra trailing field must all parse to the same record.
    variants = (
        cdx_line,
        cdx_line + "\n",
        cdx_line + " extra_field",
    )
    for candidate in variants:
        assert parse_cdx_line(candidate) == expected_record
def test_invalid_cdx():
    """Check that parse_cdx_line() returns None for malformed CDX lines."""
    invalid_cases = [
        # The final (WARC path) field is just "-".
        (
            "missing warc",
            "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -",
        ),
        # NOTE(review): this row has a 13-digit timestamp AND a non-numeric
        # offset field ("931661233i"); it is not clear from this test which
        # of the two causes the rejection -- confirm against the parser.
        (
            "bad datetime",
            "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
        ),
    ]
    for label, cdx_line in invalid_cases:
        # The label is printed first so captured output names the case
        # being exercised if the assertion below fails.
        print(label)
        assert parse_cdx_line(cdx_line) is None
def test_clean_url():
    """Check clean_url() against known (input, expected output) pairs."""
    url_cases = [
        # Upper-case host comes back lower-cased; the path is unchanged.
        ("http://BLAH.COM/file.pdf", "http://blah.com/file.pdf"),
        # A dangling ":" after the host (no port number) is dropped.
        (
            "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view",
            "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view",
        ),
    ]
    for dirty_url, expected_url in url_cases:
        assert clean_url(dirty_url) == expected_url
|