aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r--python/Pipfile4
-rw-r--r--python/Pipfile.lock438
-rwxr-xr-xpython/ingest_file.py7
-rwxr-xr-xpython/persist_tool.py4
-rw-r--r--python/sandcrawler/__init__.py2
-rw-r--r--python/sandcrawler/ia.py15
-rw-r--r--python/sandcrawler/ingest.py24
-rw-r--r--python/sandcrawler/misc.py7
-rw-r--r--python/sandcrawler/persist.py17
-rw-r--r--python/sandcrawler/workers.py6
-rwxr-xr-xpython/sandcrawler_worker.py4
-rwxr-xr-xpython/scripts/arabesque2ingestrequest.py2
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py2
-rwxr-xr-xpython/scripts/unpaywall2ingestrequest.py2
-rw-r--r--python/tests/test_misc.py8
15 files changed, 313 insertions, 229 deletions
diff --git a/python/Pipfile b/python/Pipfile
index f7e59c5..fc63697 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -34,6 +34,10 @@ python-magic = "*"
ftfy = "*"
internetarchive = "*"
Flask = ">=1"
+urlcanon = "*"
+
+# this is only to lock to a python3.5-compatible version
+zipp = "<2.0.0"
[requires]
python_version = "3.5"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 4de99d5..07e7484 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "87b092df5b5472b2b42e8d4c42e73319968c36729cc1bd3fb6de0eb3346f2a6b"
+ "sha256": "ac2202b1be9e35ebd698dd609af8d868afb3fdd0134c5581925138b51d4f475f"
},
"pipfile-spec": 6,
"requires": {
@@ -21,6 +21,22 @@
]
},
"default": {
+ "aerospike": {
+ "hashes": [
+ "sha256:3c3edb9c59491100cf5f9b0d802ee0b812b32b626c16358133cf5b9931ab8620",
+ "sha256:42e6ed4f6298aab4e5094f45a69fc805f925fbaa4ec206a87ce0a2048df02d4d",
+ "sha256:67684fb6af531765eb6061e37597bc73a348a2eff141795447ab20d9c6a61289",
+ "sha256:6aec5e0dbedb8ddd97441abaebedb04d4abbd51bfcfd6f0a6722fabc5be4efd0",
+ "sha256:9280ecb0257b0b706df7ac934dc03f518641934479d9c925a46af5231fb65f40",
+ "sha256:98779725a86ef345b9fec0b5ef60b59b2430b9c8c8e8904adb7945af6d6f9ffb",
+ "sha256:99de79a26f184a47a67123899e093cecd5c3bc0b0ce92da4f302684ad0b0116c",
+ "sha256:b170b637d69f49c02d021477359866c3d89a2c0d1477bec19343828f890d3cb1",
+ "sha256:d2f0b0288e2efafb99bbada6b39714285f317dc47fde3c4458b76e8cfbb71c11",
+ "sha256:d83faa27d40af320058a93902e19173f6295acbcc9ca225c552d7648169859f0",
+ "sha256:db7efad41300cb9bd6e70534c3110fce4e474db6d4288428609e0214a021aab8"
+ ],
+ "version": "==3.10.0"
+ },
"args": {
"hashes": [
"sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814"
@@ -51,19 +67,18 @@
},
"boto3": {
"hashes": [
- "sha256:5c00d51101d6a7ddf2207ae8a738e5c815c5fcffbee76121f38bd41d83c936a5",
- "sha256:aa58c8de6aed36211e0897598de2a3d89122ad8cd1450165679720180ab880ef"
+ "sha256:5e145fa27b56c6f3db20c20d4e0084e146a400007064af8d46c1b3bfcc779c42",
+ "sha256:95d2fb21e3f0575f8b599706100c42a5a7ae576671f6f4beea6e42ab0f57166b"
],
"index": "ia",
- "version": "==1.10.50"
+ "version": "==1.12.18"
},
"botocore": {
"hashes": [
- "sha256:765a5c637ff792239727c327b221ed5a4d851e9f176ce8b8b9eca536425c74d4",
- "sha256:adb4cb188cd0866e7337f9a049fc68db042b0340fd496d40bca349c8dbfc6a2d"
+ "sha256:2f1a54e19531bdf2d953c2db5f76b49c6936565366b2503a528b04cbbf55671a",
+ "sha256:9827c6f3bffb9e316427c276e3f22e5d82b377dd43dcbbbe71ecbd9ac2b959ab"
],
- "markers": "python_version >= '2.6' and python_version != '3.1.*' and python_version != '3.0.*'",
- "version": "==1.13.50"
+ "version": "==1.15.18"
},
"brotli": {
"hashes": [
@@ -126,11 +141,10 @@
},
"click": {
"hashes": [
- "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
- "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+ "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc",
+ "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"
],
- "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
- "version": "==7.0"
+ "version": "==7.1.1"
},
"clint": {
"hashes": [
@@ -138,6 +152,13 @@
],
"version": "==0.5.1"
},
+ "configparser": {
+ "hashes": [
+ "sha256:254c1d9c79f60c45dfde850850883d5aaa7f19a23f13561243a050d5a7c3fe4c",
+ "sha256:c7d282687a5308319bf3d2e7706e575c635b0a470342641c93bea0ea3b5331df"
+ ],
+ "version": "==4.0.2"
+ },
"confluent-kafka": {
"hashes": [
"sha256:0efd716da4f03f99d45fbb0d1583c5c8bf1eabc258a883588e3cd6ee06c0facb",
@@ -187,31 +208,28 @@
"hashes": [
"sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==0.1.4.8"
},
"dawg": {
"hashes": [
- "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5",
- "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953",
- "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e",
- "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8",
- "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3",
- "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18",
- "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55",
- "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9",
- "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171",
- "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9"
+ "sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753",
+ "sha256:34881e06278d4a54cf0b402c0c8b587bef0caa78f0eee595adc7a2aa530e48ce",
+ "sha256:73760ad1272b1b47997f1a768b8f3bf547c92475bcd62185f4ab7e1bc691964e",
+ "sha256:7aecc4c89243edaf1efe7a4d769d993a7cd9307a8a04f48e07c4fc7c44bdd38f",
+ "sha256:83ce4a73f7632b0ed31af16c2750533ecbed347bad1148a52f6436e348b5b7ac",
+ "sha256:d78929f5a7f7e083f5720992068535d133f0d3326f0c677c61c59256aa43d95e",
+ "sha256:e664a884ca48f2599ad5c2289d9b7f769e77d266560c79992e3db2cfce96cb1b",
+ "sha256:fb90b799fb7d6d728531840529c812a9ee17736da71e8a596ede8bfd6c62bf36",
+ "sha256:feb6073e0d02ac54389ad378e6c695e28fe579e2772c225a854299752effece6"
],
- "version": "==0.7.8"
+ "version": "==0.8.0"
},
"decorator": {
"hashes": [
- "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce",
- "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d"
+ "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
+ "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==4.4.1"
+ "version": "==4.4.2"
},
"docopt": {
"hashes": [
@@ -225,14 +243,12 @@
"sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827",
"sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99"
],
- "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==0.15.2"
},
"dogpile.cache": {
"hashes": [
"sha256:b348835825c9dcd251d9aad1f89f257277ac198a3e35a61980ab4cb28c75216b"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==0.9.0"
},
"elasticsearch": {
@@ -240,7 +256,6 @@
"sha256:1f0f633e3b500d5042424f75a505badf8c4b9962c1b4734cdfb3087fb67920be",
"sha256:fb5ab15ee283f104b5a7a5695c7e879cb2927e4eb5aed9c530811590b41259ad"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version < '4' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'",
"version": "==6.4.0"
},
"flask": {
@@ -253,23 +268,22 @@
},
"ftfy": {
"hashes": [
- "sha256:6d7509c45e602dec890f0f6ee0623a8b5f50ec1188ac7ab9535e18e572c99bcc"
+ "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8"
],
"index": "ia",
- "version": "==5.6"
+ "version": "==5.7"
},
"globalwayback": {
"hashes": [
- "sha256:ddd1fac7caad4181e8e623cb67ef3f6a6f7c0f306140c450b92f8bb3032aba51"
+ "sha256:6f7bd270ef827c1d8c1d5631c313dbf401e300993b593179699741fcdbc5295d"
],
"index": "ia",
- "version": "==0.4.13"
+ "version": "==0.5.2"
},
"ialib": {
"hashes": [
"sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc"
],
- "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
"version": "==0.3.0.1"
},
"idna": {
@@ -292,38 +306,34 @@
"sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
"sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
],
- "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
"version": "==1.1.0"
},
"jinja2": {
"hashes": [
- "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f",
- "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de"
+ "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250",
+ "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==2.10.3"
+ "version": "==2.11.1"
},
"jmespath": {
"hashes": [
- "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6",
- "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c"
+ "sha256:695cb76fa78a10663425d5b73ddc5714eb711157e52704d69be03b1a02ba4fec",
+ "sha256:cca55c8d153173e21baa59983015ad0daf603f9cb799904ff057bfb8ff8dc2d9"
],
- "version": "==0.9.4"
+ "version": "==0.9.5"
},
"jsonpatch": {
"hashes": [
- "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6",
- "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a"
+ "sha256:cc3a7241010a1fd3f50145a3b33be2c03c1e679faa19934b628bb07d0f64819e",
+ "sha256:ddc0f7628b8bfdd62e3cbfbc24ca6671b0b6265b50d186c2cf3659dc0f78fd6a"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==1.24"
+ "version": "==1.25"
},
"jsonpointer": {
"hashes": [
"sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362",
"sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
"version": "==2.0"
},
"kazoo": {
@@ -331,7 +341,6 @@
"sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4",
"sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e"
],
- "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
"version": "==2.5.0"
},
"markupsafe": {
@@ -340,13 +349,16 @@
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
"sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
"sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
+ "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
"sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
"sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
"sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
"sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
"sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
+ "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
"sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
+ "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
"sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
@@ -363,42 +375,68 @@
"sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
"sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
"sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
- "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"
+ "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
+ "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
+ "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
"version": "==1.1.1"
},
"minio": {
"hashes": [
- "sha256:daef713dfaaa232a719c4dc58f0bed90b6d11ab14e94cd013cef7e7ed5b2cfdc",
- "sha256:e5dc4670fe5c3e0ef61e9556bbae4768fcb051f3deb73d89e40a1b470dc9bd88"
+ "sha256:7543f990231b0d605f35b9140ec51cdb6335e741bb8c45dea9b8746c248a54bf",
+ "sha256:bf9b5001273108864e09a330a0a1795c6caee6036a124a607916830544afbf5f",
+ "sha256:f1987811525120d68a420fc6142ca310df174123da56742233824342e3a8c3f8"
],
"index": "ia",
- "version": "==5.0.6"
+ "version": "==5.0.7"
},
"pillow": {
"hashes": [
+ "sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8",
"sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b",
+ "sha256:1937c9e17f685fe6c360dd96ddb8f93f159ac721939ccbfc91a62d8124a29945",
+ "sha256:1ce73edaeb49af9ebeacfb8c58428ae39592839d3a7a16ef3926773f1c8ff8ee",
+ "sha256:2052f0372123c98497ee3294f4e20347d87b9f70d9c65ce2fc520b9339aa8465",
"sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe",
+ "sha256:26b4a2bcdf0e674505fcd2f1a882f29a99339ac3b5a8e7997b90ede2995434e2",
+ "sha256:282e069f92e43047b34bbd995a8800669af11d038db571758708ebcd96462964",
+ "sha256:2ee30463cbc8e60cba92722a3a64881a0d3df534a047e299a9bcf62ea34fd061",
"sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b",
"sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608",
"sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac",
"sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389",
+ "sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c",
"sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff",
+ "sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8",
"sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f",
+ "sha256:5bcea0df97fe0b911a6629aab0997b98e8811561c27167266758a7ede173123e",
"sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b",
+ "sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f",
+ "sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51",
+ "sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3",
+ "sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2",
"sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3",
"sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05",
+ "sha256:82351254350d9212bccf71d387efae8ad8f6f4b904d095546a77852a6b16e05b",
+ "sha256:95236f64904157256254b6cc8e29feecd9ee6985732dcb36c9f58d7dabe081d0",
"sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64",
"sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7",
"sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842",
+ "sha256:9e274583a0eab0b6d227139146e28f74488cfbc0d262c4ba2e5c0998b9c498d2",
"sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a",
+ "sha256:a10befeb7b9975d7c3d2ca3eaf0cb505db98fe50874130e182c2a6f7a606591f",
"sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3",
+ "sha256:b55f62882d8db466fcf2228422bf3147617744888bf0cf6dffb3254a52eb316b",
+ "sha256:bf83901c158ad92e77e990f51531434e5a96c6aef805a84b6e3bfe825f4d4d0c",
+ "sha256:c32f99a0c7c5313b2df78399ef908563b319de23bec0cc89f1d04c37be19eb2d",
"sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984",
+ "sha256:cf4b3c634b317ee247c3add4375b0a6bdc45eb0c12a5d7fbf9bfd47ec10b020f",
"sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3",
"sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c",
+ "sha256:e04df3808d6202dd552c837c824796899c09ff0ff9c335607904e31f9d387110",
"sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb",
"sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0",
+ "sha256:eae3711a7916eb5ec800dfb6963da09db0ada63c0481639dd0ddc0b505883a02",
"sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c",
"sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394"
],
@@ -431,6 +469,7 @@
},
"pykafka": {
"hashes": [
+ "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459",
"sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577"
],
"index": "ia",
@@ -454,7 +493,6 @@
"sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
"sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
],
- "markers": "python_version >= '2.7'",
"version": "==2.8.1"
},
"python-magic": {
@@ -497,7 +535,6 @@
"sha256:e3a057b7a64f1222b56e47bcff5e4b94c4f61faac04c7c4ecb1985e18caa3994",
"sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615"
],
- "markers": "python_version != '3.2.*' and python_version != '3.4.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'",
"version": "==5.3"
},
"raven": {
@@ -506,24 +543,22 @@
"sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
],
"index": "ia",
- "markers": null,
"version": "==6.10.0"
},
"redis": {
"hashes": [
- "sha256:3613daad9ce5951e426f460deddd5caf469e08a3af633e9578fc77d362becf62",
- "sha256:8d0fc278d3f5e1249967cba2eb4a5632d19e45ce5c09442b8422d15ee2c22cc2"
+ "sha256:0dcfb335921b88a850d461dc255ff4708294943322bd55de6cfd68972490ca1f",
+ "sha256:b205cffd05ebfd0a468db74f0eedbff8df1a7bfc47521516ade4692991bb0833"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==3.3.11"
+ "version": "==3.4.1"
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
+ "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
],
"index": "ia",
- "version": "==2.22.0"
+ "version": "==2.23.0"
},
"requests-file": {
"hashes": [
@@ -540,11 +575,10 @@
},
"s3transfer": {
"hashes": [
- "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d",
- "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba"
+ "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13",
+ "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
- "version": "==0.2.1"
+ "version": "==0.3.3"
},
"schedule": {
"hashes": [
@@ -562,25 +596,23 @@
},
"six": {
"hashes": [
- "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd",
- "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"
+ "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
+ "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==1.13.0"
+ "version": "==1.14.0"
},
"soupsieve": {
"hashes": [
- "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5",
- "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda"
+ "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae",
+ "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69"
],
- "version": "==1.9.5"
+ "version": "==2.0"
},
"sqlalchemy": {
"hashes": [
- "sha256:bfb8f464a5000b567ac1d350b9090cf081180ec1ab4aa87e7bca12dab25320ec"
+ "sha256:b92d2de62e43499d85b1780274d1b562e5159c7996f6f04a9bb46cf681ced45f"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==1.3.12"
+ "version": "==1.3.14"
},
"surt": {
"hashes": [
@@ -600,7 +632,6 @@
"sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7",
"sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==2.2.2"
},
"twitter": {
@@ -610,12 +641,18 @@
],
"version": "==1.18.0"
},
+ "urlcanon": {
+ "hashes": [
+ "sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62"
+ ],
+ "index": "ia",
+ "version": "==0.3.1"
+ },
"urllib3": {
"hashes": [
"sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
"sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
],
- "markers": "python_version >= '3.4'",
"version": "==1.22"
},
"warctools": {
@@ -626,10 +663,10 @@
},
"wayback": {
"hashes": [
- "sha256:3e89df1a3cb49baffe03572a77d00d97d54ccebeb4dd24f19d8f2b8ec3812ad3"
+ "sha256:936ae4c75af922e0f4b1bc82c66f51f97687cc6a1b8f3f1a19ec8fa7ab11ec41"
],
"index": "ia",
- "version": "==0.5.1"
+ "version": "==0.5.3"
},
"wayback-esp": {
"hashes": [
@@ -647,7 +684,6 @@
"hashes": [
"sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
],
- "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
"version": "==0.1.5"
},
"wcwidth": {
@@ -659,11 +695,18 @@
},
"werkzeug": {
"hashes": [
- "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7",
- "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4"
+ "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096",
+ "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==0.16.0"
+ "version": "==1.0.0"
+ },
+ "zipp": {
+ "hashes": [
+ "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1",
+ "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"
+ ],
+ "index": "ia",
+ "version": "==1.2.0"
}
},
"develop": {
@@ -674,20 +717,11 @@
],
"version": "==2.3.3"
},
- "atomicwrites": {
- "hashes": [
- "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
- "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
- ],
- "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
- "version": "==1.3.0"
- },
"attrs": {
"hashes": [
"sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
"sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==19.3.0"
},
"backcall": {
@@ -713,48 +747,46 @@
},
"coverage": {
"hashes": [
- "sha256:189aac76d6e0d7af15572c51892e7326ee451c076c5a50a9d266406cd6c49708",
- "sha256:1bf7ba2af1d373a1750888724f84cffdfc697738f29a353c98195f98fc011509",
- "sha256:1f4ee8e2e4243971618bc16fcc4478317405205f135e95226c2496e2a3b8dbbf",
- "sha256:225e79a5d485bc1642cb7ba02281419c633c216cdc6b26c26494ba959f09e69f",
- "sha256:23688ff75adfa8bfa2a67254d889f9bdf9302c27241d746e17547c42c732d3f4",
- "sha256:28f7f73b34a05e23758e860a89a7f649b85c6749e252eff60ebb05532d180e86",
- "sha256:2d0cb9b1fe6ad0d915d45ad3d87f03a38e979093a98597e755930db1f897afae",
- "sha256:47874b4711c5aeb295c31b228a758ce3d096be83dc37bd56da48ed99efb8813b",
- "sha256:511ec0c00840e12fb4e852e4db58fa6a01ca4da72f36a9766fae344c3d502033",
- "sha256:53e7438fef0c97bc248f88ba1edd10268cd94d5609970aaf87abbe493691af87",
- "sha256:569f9ee3025682afda6e9b0f5bb14897c0db03f1a1dc088b083dd36e743f92bb",
- "sha256:593853aa1ac6dcc6405324d877544c596c9d948ef20d2e9512a0f5d2d3202356",
- "sha256:5b0a07158360d22492f9abd02a0f2ee7981b33f0646bf796598b7673f6bbab14",
- "sha256:7ca3db38a61f3655a2613ee2c190d63639215a7a736d3c64cc7bbdb002ce6310",
- "sha256:7d1cc7acc9ce55179616cf72154f9e648136ea55987edf84addbcd9886ffeba2",
- "sha256:88b51153657612aea68fa684a5b88037597925260392b7bb4509d4f9b0bdd889",
- "sha256:955ec084f549128fa2702f0b2dc696392001d986b71acd8fd47424f28289a9c3",
- "sha256:b251c7092cbb6d789d62dc9c9e7c4fb448c9138b51285c36aeb72462cad3600e",
- "sha256:bd82b684bb498c60ef47bb1541a50e6d006dde8579934dcbdbc61d67d1ea70d9",
- "sha256:bfe102659e2ec13b86c7f3b1db6c9a4e7beea4255058d006351339e6b342d5d2",
- "sha256:c1e4e39e43057396a5e9d069bfbb6ffeee892e40c5d2effbd8cd71f34ee66c4d",
- "sha256:cb2b74c123f65e8166f7e1265829a6c8ed755c3cd16d7f50e75a83456a5f3fd7",
- "sha256:cca38ded59105f7705ef6ffe1e960b8db6c7d8279c1e71654a4775ab4454ca15",
- "sha256:cf908840896f7aa62d0ec693beb53264b154f972eb8226fb864ac38975590c4f",
- "sha256:d095a7b473f8a95f7efe821f92058c8a2ecfb18f8db6677ae3819e15dc11aaae",
- "sha256:d22b4297e7e4225ccf01f1aa55e7a96412ea0796b532dd614c3fcbafa341128e",
- "sha256:d4a2b578a7a70e0c71f662705262f87a456f1e6c1e40ada7ea699abaf070a76d",
- "sha256:ddeb42a3d5419434742bf4cc71c9eaa22df3b76808e23a82bd0b0bd360f1a9f1",
- "sha256:e65a5aa1670db6263f19fdc03daee1d7dbbadb5cb67fd0a1f16033659db13c1d",
- "sha256:eaad65bd20955131bcdb3967a4dea66b4e4d4ca488efed7c00d91ee0173387e8",
- "sha256:f45fba420b94165c17896861bb0e8b27fb7abdcedfeb154895d8553df90b7b00"
- ],
- "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version < '4' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'",
- "version": "==5.0.2"
+ "sha256:15cf13a6896048d6d947bf7d222f36e4809ab926894beb748fc9caa14605d9c3",
+ "sha256:1daa3eceed220f9fdb80d5ff950dd95112cd27f70d004c7918ca6dfc6c47054c",
+ "sha256:1e44a022500d944d42f94df76727ba3fc0a5c0b672c358b61067abb88caee7a0",
+ "sha256:25dbf1110d70bab68a74b4b9d74f30e99b177cde3388e07cc7272f2168bd1477",
+ "sha256:3230d1003eec018ad4a472d254991e34241e0bbd513e97a29727c7c2f637bd2a",
+ "sha256:3dbb72eaeea5763676a1a1efd9b427a048c97c39ed92e13336e726117d0b72bf",
+ "sha256:5012d3b8d5a500834783689a5d2292fe06ec75dc86ee1ccdad04b6f5bf231691",
+ "sha256:51bc7710b13a2ae0c726f69756cf7ffd4362f4ac36546e243136187cfcc8aa73",
+ "sha256:527b4f316e6bf7755082a783726da20671a0cc388b786a64417780b90565b987",
+ "sha256:722e4557c8039aad9592c6a4213db75da08c2cd9945320220634f637251c3894",
+ "sha256:76e2057e8ffba5472fd28a3a010431fd9e928885ff480cb278877c6e9943cc2e",
+ "sha256:77afca04240c40450c331fa796b3eab6f1e15c5ecf8bf2b8bee9706cd5452fef",
+ "sha256:7afad9835e7a651d3551eab18cbc0fdb888f0a6136169fbef0662d9cdc9987cf",
+ "sha256:9bea19ac2f08672636350f203db89382121c9c2ade85d945953ef3c8cf9d2a68",
+ "sha256:a8b8ac7876bc3598e43e2603f772d2353d9931709345ad6c1149009fd1bc81b8",
+ "sha256:b0840b45187699affd4c6588286d429cd79a99d509fe3de0f209594669bb0954",
+ "sha256:b26aaf69713e5674efbde4d728fb7124e429c9466aeaf5f4a7e9e699b12c9fe2",
+ "sha256:b63dd43f455ba878e5e9f80ba4f748c0a2156dde6e0e6e690310e24d6e8caf40",
+ "sha256:be18f4ae5a9e46edae3f329de2191747966a34a3d93046dbdf897319923923bc",
+ "sha256:c312e57847db2526bc92b9bfa78266bfbaabac3fdcd751df4d062cd4c23e46dc",
+ "sha256:c60097190fe9dc2b329a0eb03393e2e0829156a589bd732e70794c0dd804258e",
+ "sha256:c62a2143e1313944bf4a5ab34fd3b4be15367a02e9478b0ce800cb510e3bbb9d",
+ "sha256:cc1109f54a14d940b8512ee9f1c3975c181bbb200306c6d8b87d93376538782f",
+ "sha256:cd60f507c125ac0ad83f05803063bed27e50fa903b9c2cfee3f8a6867ca600fc",
+ "sha256:d513cc3db248e566e07a0da99c230aca3556d9b09ed02f420664e2da97eac301",
+ "sha256:d649dc0bcace6fcdb446ae02b98798a856593b19b637c1b9af8edadf2b150bea",
+ "sha256:d7008a6796095a79544f4da1ee49418901961c97ca9e9d44904205ff7d6aa8cb",
+ "sha256:da93027835164b8223e8e5af2cf902a4c80ed93cb0909417234f4a9df3bcd9af",
+ "sha256:e69215621707119c6baf99bda014a45b999d37602cb7043d943c76a59b05bf52",
+ "sha256:ea9525e0fef2de9208250d6c5aeeee0138921057cd67fcef90fbed49c4d62d37",
+ "sha256:fca1669d464f0c9831fd10be2eef6b86f5ebd76c724d1e0706ebdff86bb4adf0"
+ ],
+ "version": "==5.0.3"
},
"decorator": {
"hashes": [
- "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce",
- "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d"
+ "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
+ "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==4.4.1"
+ "version": "==4.4.2"
},
"idna": {
"hashes": [
@@ -765,11 +797,11 @@
},
"importlib-metadata": {
"hashes": [
- "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45",
- "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"
+ "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302",
+ "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b"
],
"markers": "python_version < '3.8'",
- "version": "==1.3.0"
+ "version": "==1.5.0"
},
"ipython": {
"hashes": [
@@ -791,15 +823,14 @@
"sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1",
"sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==4.3.21"
},
"jedi": {
"hashes": [
- "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064",
- "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807"
+ "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2",
+ "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5"
],
- "version": "==0.15.2"
+ "version": "==0.16.0"
},
"lazy-object-proxy": {
"hashes": [
@@ -825,7 +856,6 @@
"sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4",
"sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==1.4.3"
},
"mccabe": {
@@ -837,26 +867,24 @@
},
"more-itertools": {
"hashes": [
- "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d",
- "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564"
+ "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c",
+ "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507"
],
- "markers": "python_version > '2.7'",
- "version": "==8.0.2"
+ "version": "==8.2.0"
},
"packaging": {
"hashes": [
- "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb",
- "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8"
+ "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3",
+ "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==20.0"
+ "version": "==20.3"
},
"parso": {
"hashes": [
- "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1",
- "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3"
+ "sha256:0c5659e0c6eba20636f99a04f469798dca8da279645ce5c387315b2c23912157",
+ "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995"
],
- "version": "==0.5.2"
+ "version": "==0.6.2"
},
"pathlib2": {
"hashes": [
@@ -868,11 +896,11 @@
},
"pexpect": {
"hashes": [
- "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
- "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+ "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937",
+ "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"
],
"markers": "sys_platform != 'win32'",
- "version": "==4.7.0"
+ "version": "==4.8.0"
},
"pickleshare": {
"hashes": [
@@ -886,7 +914,6 @@
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
],
- "markers": "python_version >= '3.5'",
"version": "==0.13.1"
},
"prompt-toolkit": {
@@ -895,7 +922,6 @@
"sha256:dd4fca02c8069497ad931a2d09914c6b0d1b50151ce876bc15bde4c747090126",
"sha256:f7eec66105baf40eda9ab026cd8b2e251337eea8d111196695d82e0c5f0af852"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.6' and python_version != '3.0.*'",
"version": "==1.0.18"
},
"ptyprocess": {
@@ -910,16 +936,14 @@
"sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa",
"sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==1.8.1"
},
"pygments": {
"hashes": [
- "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b",
- "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe"
+ "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44",
+ "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"
],
- "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'",
- "version": "==2.5.2"
+ "version": "==2.6.1"
},
"pylint": {
"hashes": [
@@ -934,16 +958,15 @@
"sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f",
"sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
"version": "==2.4.6"
},
"pytest": {
"hashes": [
- "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa",
- "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4"
+ "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d",
+ "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"
],
"index": "ia",
- "version": "==5.3.2"
+ "version": "==5.3.5"
},
"pytest-cov": {
"hashes": [
@@ -963,12 +986,11 @@
},
"pytest-pylint": {
"hashes": [
- "sha256:8c38ea779e540e27ec4378b0820d906006e09f4ac834defbd886abbf57c7d2ec",
- "sha256:a4f5e5007f88c2095dcac799e9f7eed3d7e7a2e657596e26093814980ff5fa20",
- "sha256:a574c246535308f8f6ceac10fa82f8fffffa837071f7985b22515895185700c1"
+ "sha256:cac5d565182f39fbb7fa7f4ef1bbcc979e8f5cc260450ec72dc5aafeb782531f",
+ "sha256:dd3e232da5703e7fd14c610247dbe25dfd8e3278069b4b8bcf9778ba06b77569"
],
"index": "ia",
- "version": "==0.14.1"
+ "version": "==0.15.1"
},
"pytest-pythonpath": {
"hashes": [
@@ -979,19 +1001,19 @@
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
+ "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
],
"index": "ia",
- "version": "==2.22.0"
+ "version": "==2.23.0"
},
"responses": {
"hashes": [
- "sha256:515fd7c024097e5da76e9c4cf719083d181f1c3ddc09c2e0e49284ce863dd263",
- "sha256:8ce8cb4e7e1ad89336f8865af152e0563d2e7f0e0b86d2cf75f015f819409243"
+ "sha256:0474ce3c897fbcc1aef286117c93499882d5c440f06a805947e4b1cb5ab3d474",
+ "sha256:f83613479a021e233e82d52ffb3e2e0e2836d24b0cc88a0fa31978789f78d0e5"
],
"index": "ia",
- "version": "==0.10.9"
+ "version": "==0.10.12"
},
"simplegeneric": {
"hashes": [
@@ -1001,52 +1023,50 @@
},
"six": {
"hashes": [
- "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd",
- "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"
+ "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
+ "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==1.13.0"
+ "version": "==1.14.0"
},
"traitlets": {
"hashes": [
"sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44",
"sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
"version": "==4.3.3"
},
"typed-ast": {
"hashes": [
- "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161",
- "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e",
- "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e",
- "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0",
- "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c",
- "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47",
- "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631",
- "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4",
- "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34",
- "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b",
- "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2",
- "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e",
- "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a",
- "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233",
- "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1",
- "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36",
- "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d",
- "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a",
- "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66",
- "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12"
+ "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
+ "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
+ "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
+ "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
+ "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
+ "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
+ "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
+ "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
+ "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
+ "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
+ "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
+ "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
+ "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
+ "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
+ "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
+ "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
+ "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
+ "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
+ "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
+ "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
+ "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
],
"markers": "implementation_name == 'cpython' and python_version < '3.8'",
- "version": "==1.4.0"
+ "version": "==1.4.1"
},
"urllib3": {
"hashes": [
"sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
"sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
],
- "markers": "python_version >= '3.4'",
"version": "==1.22"
},
"wcwidth": {
@@ -1064,11 +1084,11 @@
},
"zipp": {
"hashes": [
- "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e",
- "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335"
+ "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1",
+ "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"
],
- "markers": "python_version >= '3.5'",
- "version": "==0.6.0"
+ "index": "ia",
+ "version": "==1.2.0"
}
}
}
diff --git a/python/ingest_file.py b/python/ingest_file.py
index d4fdcac..f6f694e 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -17,7 +17,9 @@ def run_single_ingest(args):
)
if args.force_recrawl:
request['force_recrawl'] = True
- ingester = IngestFileWorker()
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ )
result = ingester.process(request)
print(json.dumps(result, sort_keys=True))
return result
@@ -51,6 +53,9 @@ def main():
sub_single.add_argument('--force-recrawl',
action='store_true',
help="ignore GWB history and use SPNv2 to re-crawl")
+ sub_single.add_argument('--no-spn2',
+ action='store_true',
+ help="don't use live web (SPNv2)")
sub_single.add_argument('--type',
default="pdf",
help="type of ingest (pdf, html, etc)")
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 19e6dd7..869af06 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -41,6 +41,7 @@ def run_grobid(args):
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
s3_only=args.s3_only,
+ db_only=args.db_only,
)
pusher = JsonLinePusher(
worker,
@@ -135,6 +136,9 @@ def main():
sub_grobid.add_argument('--s3-only',
action='store_true',
help="only upload TEI-XML to S3 (don't write to database)")
+ sub_grobid.add_argument('--db-only',
+ action='store_true',
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
sub_grobid_disk = subparsers.add_parser('grobid-disk',
help="dump GRBOID output to (local) files on disk")
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 3d49096..492b558 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 99e92be..25697be 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -575,6 +575,7 @@ class WaybackClient:
next_url = start_url
urls_seen = [start_url]
for i in range(self.max_redirects):
+ print(" URL: {}".format(next_url), file=sys.stderr)
cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype)
#print(cdx_row, file=sys.stderr)
if not cdx_row:
@@ -659,7 +660,12 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
- next_url = resource.location
+ if resource.location.startswith('/'):
+ # redirect location does not include hostname
+ domain_prefix = '/'.join(next_url.split('/')[:3])
+ next_url = domain_prefix + resource.location
+ else:
+ next_url = resource.location
else:
next_url = self.fetch_replay_redirect(
url=cdx_row.url,
@@ -753,7 +759,7 @@ class SavePageNowClient:
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url):
+ def save_url_now_v2(self, request_url, force_get=0):
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -792,6 +798,7 @@ class SavePageNowClient:
'capture_all': 1,
'capture_screenshot': 0,
'if_not_archived_within': '1d',
+ 'force_get': force_get,
},
)
if resp.status_code == 429:
@@ -861,14 +868,14 @@ class SavePageNowClient:
None,
)
- def crawl_resource(self, start_url, wayback_client):
+ def crawl_resource(self, start_url, wayback_client, force_get=0):
"""
Runs a SPN2 crawl, then fetches body from wayback.
TODO: possible to fetch from petabox?
"""
- spn_result = self.save_url_now_v2(start_url)
+ spn_result = self.save_url_now_v2(start_url, force_get=force_get)
if not spn_result.success:
status = spn_result.status
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7211ee0..c9a697c 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,7 +9,7 @@ from collections import namedtuple
from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.misc import gen_file_metadata, clean_url
from sandcrawler.html import extract_fulltext_url
from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
@@ -93,6 +93,15 @@ class IngestFileWorker(SandcrawlerWorker):
"digital.ucd.ie/", # ireland national historical
]
+ # these are special-case web domains for which we want SPN2 to not run
+ # a headless browser (brozzler), but instead simply run wget.
+ # the motivation could be to work around browser issues, or in the
+ # future possibly to increase download efficiency (wget/fetch being
+ # faster than browser fetch)
+ self.spn2_simple_get_domains = [
+ ]
+
+
def check_existing_ingest(self, base_url):
"""
Check in sandcrawler-db (postgres) to see if we have already ingested
@@ -138,7 +147,12 @@ class IngestFileWorker(SandcrawlerWorker):
if self.try_spn2 and (not resource or not resource.hit or soft404):
via = "spn2"
- resource = self.spn_client.crawl_resource(url, self.wayback_client)
+ force_get = 0
+ for domain in self.spn2_simple_get_domains:
+ if domain in url:
+ force_get = 1
+ break
+ resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get)
print("[FETCH {}\t] {}\t{}".format(
via,
resource.status,
@@ -224,7 +238,11 @@ class IngestFileWorker(SandcrawlerWorker):
request['ingest_type'] = "pdf"
assert request.get('ingest_type') == "pdf"
ingest_type = request.get('ingest_type')
- base_url = request['base_url']
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request['base_url'])
force_recrawl = bool(request.get('force_recrawl', False))
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 88669e6..d9c9d55 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -6,8 +6,15 @@ import datetime
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
+def clean_url(s):
+ parsed = urlcanon.parse_url(s)
+ if not parsed.port and parsed.colon_before_port:
+ parsed.colon_before_port = b''
+ return str(urlcanon.whatwg(parsed))
+
def gen_file_metadata(blob):
"""
Takes a file blob (bytestream) and returns hashes and other metadata.
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index f5de44a..379fd8b 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -240,6 +240,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
default_bucket=kwargs['s3_bucket'],
)
self.s3_only = kwargs.get('s3_only', False)
+ self.db_only = kwargs.get('db_only', False)
+ assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
def process(self, record):
"""
@@ -264,13 +266,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
continue
assert len(r['key']) == 40
- resp = self.s3.put_blob(
- folder="grobid",
- blob=r['tei_xml'],
- sha1hex=r['key'],
- extension=".tei.xml",
- )
- self.counts['s3-put'] += 1
+ if not self.db_only:
+ resp = self.s3.put_blob(
+ folder="grobid",
+ blob=r['tei_xml'],
+ sha1hex=r['key'],
+ extension=".tei.xml",
+ )
+ self.counts['s3-put'] += 1
# enhance with teixml2json metadata, if available
try:
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index c290421..54bd581 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -52,6 +52,12 @@ class SandcrawlerWorker(object):
print("Worker: {}".format(self.counts), file=sys.stderr)
return self.counts
+ def process(self, task):
+ """
+ Derived workers need to implement business logic here.
+ """
+ raise NotImplementedError('implementation required')
+
class MultiprocessWrapper(SandcrawlerWorker):
def __init__(self, worker, sink, jobs=None):
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 0ba4d03..5720f48 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -58,6 +58,7 @@ def run_persist_grobid(args):
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
s3_only=args.s3_only,
+ db_only=args.db_only,
)
pusher = KafkaJsonPusher(
worker=worker,
@@ -173,6 +174,9 @@ def main():
sub_persist_grobid.add_argument('--s3-only',
action='store_true',
help="only upload TEI-XML to S3 (don't write to database)")
+ sub_persist_grobid.add_argument('--db-only',
+ action='store_true',
+ help="only write status to database (don't upload TEI-XML to S3)")
sub_persist_grobid.set_defaults(func=run_persist_grobid)
sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index fa46f10..03a1f29 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
"""
This script is intended to be used for backfill ingest of old crawls. It can
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 563855d..494ec7a 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
"""
This script is used to turn ingest request postgres rows (in JSON export
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index c51a152..2999574 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
"""
Transform an unpaywall dump (JSON) into ingest requests.
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 420bc07..29f9e9f 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
def test_gen_file_metadata():
@@ -69,3 +69,9 @@ def test_invalid_cdx():
print("bad datetime")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
assert parse_cdx_line(raw) == None
+
+def test_clean_url():
+ assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
+ assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
+ "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+