diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/Pipfile | 4 | ||||
-rw-r--r-- | python/Pipfile.lock | 438 | ||||
-rwxr-xr-x | python/ingest_file.py | 7 | ||||
-rw-r--r-- | python/sandcrawler/__init__.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/ingest.py | 8 | ||||
-rw-r--r-- | python/sandcrawler/misc.py | 7 | ||||
-rwxr-xr-x | python/scripts/arabesque2ingestrequest.py | 2 | ||||
-rwxr-xr-x | python/scripts/ingestrequest_row2json.py | 2 | ||||
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 2 | ||||
-rw-r--r-- | python/tests/test_misc.py | 8 |
10 files changed, 263 insertions, 217 deletions
diff --git a/python/Pipfile b/python/Pipfile index f7e59c5..fc63697 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -34,6 +34,10 @@ python-magic = "*" ftfy = "*" internetarchive = "*" Flask = ">=1" +urlcanon = "*" + +# this is only to lock to a python3.5-compatible version +zipp = "<2.0.0" [requires] python_version = "3.5" diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 4de99d5..07e7484 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "87b092df5b5472b2b42e8d4c42e73319968c36729cc1bd3fb6de0eb3346f2a6b" + "sha256": "ac2202b1be9e35ebd698dd609af8d868afb3fdd0134c5581925138b51d4f475f" }, "pipfile-spec": 6, "requires": { @@ -21,6 +21,22 @@ ] }, "default": { + "aerospike": { + "hashes": [ + "sha256:3c3edb9c59491100cf5f9b0d802ee0b812b32b626c16358133cf5b9931ab8620", + "sha256:42e6ed4f6298aab4e5094f45a69fc805f925fbaa4ec206a87ce0a2048df02d4d", + "sha256:67684fb6af531765eb6061e37597bc73a348a2eff141795447ab20d9c6a61289", + "sha256:6aec5e0dbedb8ddd97441abaebedb04d4abbd51bfcfd6f0a6722fabc5be4efd0", + "sha256:9280ecb0257b0b706df7ac934dc03f518641934479d9c925a46af5231fb65f40", + "sha256:98779725a86ef345b9fec0b5ef60b59b2430b9c8c8e8904adb7945af6d6f9ffb", + "sha256:99de79a26f184a47a67123899e093cecd5c3bc0b0ce92da4f302684ad0b0116c", + "sha256:b170b637d69f49c02d021477359866c3d89a2c0d1477bec19343828f890d3cb1", + "sha256:d2f0b0288e2efafb99bbada6b39714285f317dc47fde3c4458b76e8cfbb71c11", + "sha256:d83faa27d40af320058a93902e19173f6295acbcc9ca225c552d7648169859f0", + "sha256:db7efad41300cb9bd6e70534c3110fce4e474db6d4288428609e0214a021aab8" + ], + "version": "==3.10.0" + }, "args": { "hashes": [ "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814" @@ -51,19 +67,18 @@ }, "boto3": { "hashes": [ - "sha256:5c00d51101d6a7ddf2207ae8a738e5c815c5fcffbee76121f38bd41d83c936a5", - "sha256:aa58c8de6aed36211e0897598de2a3d89122ad8cd1450165679720180ab880ef" + "sha256:5e145fa27b56c6f3db20c20d4e0084e146a400007064af8d46c1b3bfcc779c42", + "sha256:95d2fb21e3f0575f8b599706100c42a5a7ae576671f6f4beea6e42ab0f57166b" ], "index": "ia", - "version": "==1.10.50" + "version": "==1.12.18" }, "botocore": { "hashes": [ - "sha256:765a5c637ff792239727c327b221ed5a4d851e9f176ce8b8b9eca536425c74d4", - "sha256:adb4cb188cd0866e7337f9a049fc68db042b0340fd496d40bca349c8dbfc6a2d" + "sha256:2f1a54e19531bdf2d953c2db5f76b49c6936565366b2503a528b04cbbf55671a", + "sha256:9827c6f3bffb9e316427c276e3f22e5d82b377dd43dcbbbe71ecbd9ac2b959ab" ], - "markers": "python_version >= '2.6' and python_version != '3.1.*' and python_version != '3.0.*'", - "version": "==1.13.50" + "version": "==1.15.18" }, "brotli": { "hashes": [ @@ -126,11 +141,10 @@ }, "click": { "hashes": [ - "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", - "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" + "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc", + "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a" ], - "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'", - "version": "==7.0" + "version": "==7.1.1" }, "clint": { "hashes": [ @@ -138,6 +152,13 @@ ], "version": "==0.5.1" }, + "configparser": { + "hashes": [ + "sha256:254c1d9c79f60c45dfde850850883d5aaa7f19a23f13561243a050d5a7c3fe4c", + "sha256:c7d282687a5308319bf3d2e7706e575c635b0a470342641c93bea0ea3b5331df" + ], + "version": "==4.0.2" + }, "confluent-kafka": { "hashes": [ "sha256:0efd716da4f03f99d45fbb0d1583c5c8bf1eabc258a883588e3cd6ee06c0facb", @@ -187,31 +208,28 @@ "hashes": [ "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.1.4.8" }, "dawg": { "hashes": [ - "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5", - "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953", - "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e", - "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8", - "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3", - "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18", - "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55", - "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9", - "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171", - "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9" + "sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753", + "sha256:34881e06278d4a54cf0b402c0c8b587bef0caa78f0eee595adc7a2aa530e48ce", + "sha256:73760ad1272b1b47997f1a768b8f3bf547c92475bcd62185f4ab7e1bc691964e", + "sha256:7aecc4c89243edaf1efe7a4d769d993a7cd9307a8a04f48e07c4fc7c44bdd38f", + "sha256:83ce4a73f7632b0ed31af16c2750533ecbed347bad1148a52f6436e348b5b7ac", + "sha256:d78929f5a7f7e083f5720992068535d133f0d3326f0c677c61c59256aa43d95e", + "sha256:e664a884ca48f2599ad5c2289d9b7f769e77d266560c79992e3db2cfce96cb1b", + "sha256:fb90b799fb7d6d728531840529c812a9ee17736da71e8a596ede8bfd6c62bf36", + "sha256:feb6073e0d02ac54389ad378e6c695e28fe579e2772c225a854299752effece6" ], - "version": "==0.7.8" + "version": "==0.8.0" }, "decorator": { "hashes": [ - "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce", - "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d" + "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760", + "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==4.4.1" + "version": "==4.4.2" }, "docopt": { "hashes": [ @@ -225,14 +243,12 @@ "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" ], - "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.15.2" }, "dogpile.cache": { "hashes": [ "sha256:b348835825c9dcd251d9aad1f89f257277ac198a3e35a61980ab4cb28c75216b" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.9.0" }, "elasticsearch": { @@ -240,7 +256,6 @@ "sha256:1f0f633e3b500d5042424f75a505badf8c4b9962c1b4734cdfb3087fb67920be", "sha256:fb5ab15ee283f104b5a7a5695c7e879cb2927e4eb5aed9c530811590b41259ad" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version < '4' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'", "version": "==6.4.0" }, "flask": { @@ -253,23 +268,22 @@ }, "ftfy": { "hashes": [ - "sha256:6d7509c45e602dec890f0f6ee0623a8b5f50ec1188ac7ab9535e18e572c99bcc" + "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8" ], "index": "ia", - "version": "==5.6" + "version": "==5.7" }, "globalwayback": { "hashes": [ - "sha256:ddd1fac7caad4181e8e623cb67ef3f6a6f7c0f306140c450b92f8bb3032aba51" + "sha256:6f7bd270ef827c1d8c1d5631c313dbf401e300993b593179699741fcdbc5295d" ], "index": "ia", - "version": "==0.4.13" + "version": "==0.5.2" }, "ialib": { "hashes": [ "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==0.3.0.1" }, "idna": { @@ -292,38 +306,34 @@ "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19", "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749" ], - "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'", "version": "==1.1.0" }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250", + "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==2.10.3" + "version": "==2.11.1" }, "jmespath": { "hashes": [ - "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6", - "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c" + "sha256:695cb76fa78a10663425d5b73ddc5714eb711157e52704d69be03b1a02ba4fec", + "sha256:cca55c8d153173e21baa59983015ad0daf603f9cb799904ff057bfb8ff8dc2d9" ], - "version": "==0.9.4" + "version": "==0.9.5" }, "jsonpatch": { "hashes": [ - "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6", - "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a" + "sha256:cc3a7241010a1fd3f50145a3b33be2c03c1e679faa19934b628bb07d0f64819e", + "sha256:ddc0f7628b8bfdd62e3cbfbc24ca6671b0b6265b50d186c2cf3659dc0f78fd6a" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==1.24" + "version": "==1.25" }, "jsonpointer": { "hashes": [ "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362", "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", "version": "==2.0" }, "kazoo": { @@ -331,7 +341,6 @@ "sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4", "sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==2.5.0" }, "markupsafe": { @@ -340,13 +349,16 @@ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", @@ -363,42 +375,68 @@ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", "version": "==1.1.1" }, "minio": { "hashes": [ - "sha256:daef713dfaaa232a719c4dc58f0bed90b6d11ab14e94cd013cef7e7ed5b2cfdc", - "sha256:e5dc4670fe5c3e0ef61e9556bbae4768fcb051f3deb73d89e40a1b470dc9bd88" + "sha256:7543f990231b0d605f35b9140ec51cdb6335e741bb8c45dea9b8746c248a54bf", + "sha256:bf9b5001273108864e09a330a0a1795c6caee6036a124a607916830544afbf5f", + "sha256:f1987811525120d68a420fc6142ca310df174123da56742233824342e3a8c3f8" ], "index": "ia", - "version": "==5.0.6" + "version": "==5.0.7" }, "pillow": { "hashes": [ + "sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8", "sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b", + "sha256:1937c9e17f685fe6c360dd96ddb8f93f159ac721939ccbfc91a62d8124a29945", + "sha256:1ce73edaeb49af9ebeacfb8c58428ae39592839d3a7a16ef3926773f1c8ff8ee", + "sha256:2052f0372123c98497ee3294f4e20347d87b9f70d9c65ce2fc520b9339aa8465", "sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe", + "sha256:26b4a2bcdf0e674505fcd2f1a882f29a99339ac3b5a8e7997b90ede2995434e2", + "sha256:282e069f92e43047b34bbd995a8800669af11d038db571758708ebcd96462964", + "sha256:2ee30463cbc8e60cba92722a3a64881a0d3df534a047e299a9bcf62ea34fd061", "sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b", "sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608", "sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac", "sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389", + "sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c", "sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff", + "sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8", "sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f", + "sha256:5bcea0df97fe0b911a6629aab0997b98e8811561c27167266758a7ede173123e", "sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b", + "sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f", + "sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51", + "sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3", + "sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2", "sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3", "sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05", + "sha256:82351254350d9212bccf71d387efae8ad8f6f4b904d095546a77852a6b16e05b", + "sha256:95236f64904157256254b6cc8e29feecd9ee6985732dcb36c9f58d7dabe081d0", "sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64", "sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7", "sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842", + "sha256:9e274583a0eab0b6d227139146e28f74488cfbc0d262c4ba2e5c0998b9c498d2", "sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a", + "sha256:a10befeb7b9975d7c3d2ca3eaf0cb505db98fe50874130e182c2a6f7a606591f", "sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3", + "sha256:b55f62882d8db466fcf2228422bf3147617744888bf0cf6dffb3254a52eb316b", + "sha256:bf83901c158ad92e77e990f51531434e5a96c6aef805a84b6e3bfe825f4d4d0c", + "sha256:c32f99a0c7c5313b2df78399ef908563b319de23bec0cc89f1d04c37be19eb2d", "sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984", + "sha256:cf4b3c634b317ee247c3add4375b0a6bdc45eb0c12a5d7fbf9bfd47ec10b020f", "sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3", "sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c", + "sha256:e04df3808d6202dd552c837c824796899c09ff0ff9c335607904e31f9d387110", "sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb", "sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0", + "sha256:eae3711a7916eb5ec800dfb6963da09db0ada63c0481639dd0ddc0b505883a02", "sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c", "sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394" ], @@ -431,6 +469,7 @@ }, "pykafka": { "hashes": [ + "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459", "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577" ], "index": "ia", @@ -454,7 +493,6 @@ "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" ], - "markers": "python_version >= '2.7'", "version": "==2.8.1" }, "python-magic": { @@ -497,7 +535,6 @@ "sha256:e3a057b7a64f1222b56e47bcff5e4b94c4f61faac04c7c4ecb1985e18caa3994", "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615" ], - "markers": "python_version != '3.2.*' and python_version != '3.4.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'", "version": "==5.3" }, "raven": { @@ -506,24 +543,22 @@ "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4" ], "index": "ia", - "markers": null, "version": "==6.10.0" }, "redis": { "hashes": [ - "sha256:3613daad9ce5951e426f460deddd5caf469e08a3af633e9578fc77d362becf62", - "sha256:8d0fc278d3f5e1249967cba2eb4a5632d19e45ce5c09442b8422d15ee2c22cc2" + "sha256:0dcfb335921b88a850d461dc255ff4708294943322bd55de6cfd68972490ca1f", + "sha256:b205cffd05ebfd0a468db74f0eedbff8df1a7bfc47521516ade4692991bb0833" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==3.3.11" + "version": "==3.4.1" }, "requests": { "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], "index": "ia", - "version": "==2.22.0" + "version": "==2.23.0" }, "requests-file": { "hashes": [ @@ -540,11 +575,10 @@ }, "s3transfer": { "hashes": [ - "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d", - "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba" + "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13", + "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", - "version": "==0.2.1" + "version": "==0.3.3" }, "schedule": { "hashes": [ @@ -562,25 +596,23 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==1.13.0" + "version": "==1.14.0" }, "soupsieve": { "hashes": [ - "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5", - "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda" + "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae", + "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69" ], - "version": "==1.9.5" + "version": "==2.0" }, "sqlalchemy": { "hashes": [ - "sha256:bfb8f464a5000b567ac1d350b9090cf081180ec1ab4aa87e7bca12dab25320ec" + "sha256:b92d2de62e43499d85b1780274d1b562e5159c7996f6f04a9bb46cf681ced45f" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==1.3.12" + "version": "==1.3.14" }, "surt": { "hashes": [ @@ -600,7 +632,6 @@ "sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7", "sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==2.2.2" }, "twitter": { @@ -610,12 +641,18 @@ ], "version": "==1.18.0" }, + "urlcanon": { + "hashes": [ + "sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62" + ], + "index": "ia", + "version": "==0.3.1" + }, "urllib3": { "hashes": [ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], - "markers": "python_version >= '3.4'", "version": "==1.22" }, "warctools": { @@ -626,10 +663,10 @@ }, "wayback": { "hashes": [ - "sha256:3e89df1a3cb49baffe03572a77d00d97d54ccebeb4dd24f19d8f2b8ec3812ad3" + "sha256:936ae4c75af922e0f4b1bc82c66f51f97687cc6a1b8f3f1a19ec8fa7ab11ec41" ], "index": "ia", - "version": "==0.5.1" + "version": "==0.5.3" }, "wayback-esp": { "hashes": [ @@ -647,7 +684,6 @@ "hashes": [ "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==0.1.5" }, "wcwidth": { @@ -659,11 +695,18 @@ }, "werkzeug": { "hashes": [ - "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", - "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", + "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==0.16.0" + "version": "==1.0.0" + }, + "zipp": { + "hashes": [ + "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1", + "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921" + ], + "index": "ia", + "version": "==1.2.0" } }, "develop": { @@ -674,20 +717,11 @@ ], "version": "==2.3.3" }, - "atomicwrites": { - "hashes": [ - "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", - "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" - ], - "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'", - "version": "==1.3.0" - }, "attrs": { "hashes": [ "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==19.3.0" }, "backcall": { @@ -713,48 +747,46 @@ }, "coverage": { "hashes": [ - "sha256:189aac76d6e0d7af15572c51892e7326ee451c076c5a50a9d266406cd6c49708", - "sha256:1bf7ba2af1d373a1750888724f84cffdfc697738f29a353c98195f98fc011509", - "sha256:1f4ee8e2e4243971618bc16fcc4478317405205f135e95226c2496e2a3b8dbbf", - "sha256:225e79a5d485bc1642cb7ba02281419c633c216cdc6b26c26494ba959f09e69f", - "sha256:23688ff75adfa8bfa2a67254d889f9bdf9302c27241d746e17547c42c732d3f4", - "sha256:28f7f73b34a05e23758e860a89a7f649b85c6749e252eff60ebb05532d180e86", - "sha256:2d0cb9b1fe6ad0d915d45ad3d87f03a38e979093a98597e755930db1f897afae", - "sha256:47874b4711c5aeb295c31b228a758ce3d096be83dc37bd56da48ed99efb8813b", - "sha256:511ec0c00840e12fb4e852e4db58fa6a01ca4da72f36a9766fae344c3d502033", - "sha256:53e7438fef0c97bc248f88ba1edd10268cd94d5609970aaf87abbe493691af87", - "sha256:569f9ee3025682afda6e9b0f5bb14897c0db03f1a1dc088b083dd36e743f92bb", - "sha256:593853aa1ac6dcc6405324d877544c596c9d948ef20d2e9512a0f5d2d3202356", - "sha256:5b0a07158360d22492f9abd02a0f2ee7981b33f0646bf796598b7673f6bbab14", - "sha256:7ca3db38a61f3655a2613ee2c190d63639215a7a736d3c64cc7bbdb002ce6310", - "sha256:7d1cc7acc9ce55179616cf72154f9e648136ea55987edf84addbcd9886ffeba2", - "sha256:88b51153657612aea68fa684a5b88037597925260392b7bb4509d4f9b0bdd889", - "sha256:955ec084f549128fa2702f0b2dc696392001d986b71acd8fd47424f28289a9c3", - "sha256:b251c7092cbb6d789d62dc9c9e7c4fb448c9138b51285c36aeb72462cad3600e", - "sha256:bd82b684bb498c60ef47bb1541a50e6d006dde8579934dcbdbc61d67d1ea70d9", - "sha256:bfe102659e2ec13b86c7f3b1db6c9a4e7beea4255058d006351339e6b342d5d2", - "sha256:c1e4e39e43057396a5e9d069bfbb6ffeee892e40c5d2effbd8cd71f34ee66c4d", - "sha256:cb2b74c123f65e8166f7e1265829a6c8ed755c3cd16d7f50e75a83456a5f3fd7", - "sha256:cca38ded59105f7705ef6ffe1e960b8db6c7d8279c1e71654a4775ab4454ca15", - "sha256:cf908840896f7aa62d0ec693beb53264b154f972eb8226fb864ac38975590c4f", - "sha256:d095a7b473f8a95f7efe821f92058c8a2ecfb18f8db6677ae3819e15dc11aaae", - "sha256:d22b4297e7e4225ccf01f1aa55e7a96412ea0796b532dd614c3fcbafa341128e", - "sha256:d4a2b578a7a70e0c71f662705262f87a456f1e6c1e40ada7ea699abaf070a76d", - "sha256:ddeb42a3d5419434742bf4cc71c9eaa22df3b76808e23a82bd0b0bd360f1a9f1", - "sha256:e65a5aa1670db6263f19fdc03daee1d7dbbadb5cb67fd0a1f16033659db13c1d", - "sha256:eaad65bd20955131bcdb3967a4dea66b4e4d4ca488efed7c00d91ee0173387e8", - "sha256:f45fba420b94165c17896861bb0e8b27fb7abdcedfeb154895d8553df90b7b00" - ], - "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version < '4' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'", - "version": "==5.0.2" + "sha256:15cf13a6896048d6d947bf7d222f36e4809ab926894beb748fc9caa14605d9c3", + "sha256:1daa3eceed220f9fdb80d5ff950dd95112cd27f70d004c7918ca6dfc6c47054c", + "sha256:1e44a022500d944d42f94df76727ba3fc0a5c0b672c358b61067abb88caee7a0", + "sha256:25dbf1110d70bab68a74b4b9d74f30e99b177cde3388e07cc7272f2168bd1477", + "sha256:3230d1003eec018ad4a472d254991e34241e0bbd513e97a29727c7c2f637bd2a", + "sha256:3dbb72eaeea5763676a1a1efd9b427a048c97c39ed92e13336e726117d0b72bf", + "sha256:5012d3b8d5a500834783689a5d2292fe06ec75dc86ee1ccdad04b6f5bf231691", + "sha256:51bc7710b13a2ae0c726f69756cf7ffd4362f4ac36546e243136187cfcc8aa73", + "sha256:527b4f316e6bf7755082a783726da20671a0cc388b786a64417780b90565b987", + "sha256:722e4557c8039aad9592c6a4213db75da08c2cd9945320220634f637251c3894", + "sha256:76e2057e8ffba5472fd28a3a010431fd9e928885ff480cb278877c6e9943cc2e", + "sha256:77afca04240c40450c331fa796b3eab6f1e15c5ecf8bf2b8bee9706cd5452fef", + "sha256:7afad9835e7a651d3551eab18cbc0fdb888f0a6136169fbef0662d9cdc9987cf", + "sha256:9bea19ac2f08672636350f203db89382121c9c2ade85d945953ef3c8cf9d2a68", + "sha256:a8b8ac7876bc3598e43e2603f772d2353d9931709345ad6c1149009fd1bc81b8", + "sha256:b0840b45187699affd4c6588286d429cd79a99d509fe3de0f209594669bb0954", + "sha256:b26aaf69713e5674efbde4d728fb7124e429c9466aeaf5f4a7e9e699b12c9fe2", + "sha256:b63dd43f455ba878e5e9f80ba4f748c0a2156dde6e0e6e690310e24d6e8caf40", + "sha256:be18f4ae5a9e46edae3f329de2191747966a34a3d93046dbdf897319923923bc", + "sha256:c312e57847db2526bc92b9bfa78266bfbaabac3fdcd751df4d062cd4c23e46dc", + "sha256:c60097190fe9dc2b329a0eb03393e2e0829156a589bd732e70794c0dd804258e", + "sha256:c62a2143e1313944bf4a5ab34fd3b4be15367a02e9478b0ce800cb510e3bbb9d", + "sha256:cc1109f54a14d940b8512ee9f1c3975c181bbb200306c6d8b87d93376538782f", + "sha256:cd60f507c125ac0ad83f05803063bed27e50fa903b9c2cfee3f8a6867ca600fc", + "sha256:d513cc3db248e566e07a0da99c230aca3556d9b09ed02f420664e2da97eac301", + "sha256:d649dc0bcace6fcdb446ae02b98798a856593b19b637c1b9af8edadf2b150bea", + "sha256:d7008a6796095a79544f4da1ee49418901961c97ca9e9d44904205ff7d6aa8cb", + "sha256:da93027835164b8223e8e5af2cf902a4c80ed93cb0909417234f4a9df3bcd9af", + "sha256:e69215621707119c6baf99bda014a45b999d37602cb7043d943c76a59b05bf52", + "sha256:ea9525e0fef2de9208250d6c5aeeee0138921057cd67fcef90fbed49c4d62d37", + "sha256:fca1669d464f0c9831fd10be2eef6b86f5ebd76c724d1e0706ebdff86bb4adf0" + ], + "version": "==5.0.3" }, "decorator": { "hashes": [ - "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce", - "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d" + "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760", + "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==4.4.1" + "version": "==4.4.2" }, "idna": { "hashes": [ @@ -765,11 +797,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45", - "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f" + "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", + "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" ], "markers": "python_version < '3.8'", - "version": "==1.3.0" + "version": "==1.5.0" }, "ipython": { "hashes": [ @@ -791,15 +823,14 @@ "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1", "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==4.3.21" }, "jedi": { "hashes": [ - "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064", - "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807" + "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2", + "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5" ], - "version": "==0.15.2" + "version": "==0.16.0" }, "lazy-object-proxy": { "hashes": [ @@ -825,7 +856,6 @@ "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4", "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==1.4.3" }, "mccabe": { @@ -837,26 +867,24 @@ }, "more-itertools": { "hashes": [ - "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d", - "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564" + "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", + "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507" ], - "markers": "python_version > '2.7'", - "version": "==8.0.2" + "version": "==8.2.0" }, "packaging": { "hashes": [ - "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb", - "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8" + "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3", + "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==20.0" + "version": "==20.3" }, "parso": { "hashes": [ - "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1", - "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3" + "sha256:0c5659e0c6eba20636f99a04f469798dca8da279645ce5c387315b2c23912157", + "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995" ], - "version": "==0.5.2" + "version": "==0.6.2" }, "pathlib2": { "hashes": [ @@ -868,11 +896,11 @@ }, "pexpect": { "hashes": [ - "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", - "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" + "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937", + "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c" ], "markers": "sys_platform != 'win32'", - "version": "==4.7.0" + "version": "==4.8.0" }, "pickleshare": { "hashes": [ @@ -886,7 +914,6 @@ "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" ], - "markers": "python_version >= '3.5'", "version": "==0.13.1" }, "prompt-toolkit": { @@ -895,7 +922,6 @@ "sha256:dd4fca02c8069497ad931a2d09914c6b0d1b50151ce876bc15bde4c747090126", "sha256:f7eec66105baf40eda9ab026cd8b2e251337eea8d111196695d82e0c5f0af852" ], - "markers": "python_version != '3.1.*' and python_version >= '2.6' and python_version != '3.0.*'", "version": "==1.0.18" }, "ptyprocess": { @@ -910,16 +936,14 @@ "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa", "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==1.8.1" }, "pygments": { "hashes": [ - "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b", - "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe" + "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44", + "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324" ], - "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'", - "version": "==2.5.2" + "version": "==2.6.1" }, "pylint": { "hashes": [ @@ -934,16 +958,15 @@ "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f", "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", "version": "==2.4.6" }, "pytest": { "hashes": [ - "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa", - "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4" + "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d", + "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6" ], "index": "ia", - "version": "==5.3.2" + "version": "==5.3.5" }, "pytest-cov": { "hashes": [ @@ -963,12 +986,11 @@ }, "pytest-pylint": { "hashes": [ - "sha256:8c38ea779e540e27ec4378b0820d906006e09f4ac834defbd886abbf57c7d2ec", - "sha256:a4f5e5007f88c2095dcac799e9f7eed3d7e7a2e657596e26093814980ff5fa20", - "sha256:a574c246535308f8f6ceac10fa82f8fffffa837071f7985b22515895185700c1" + "sha256:cac5d565182f39fbb7fa7f4ef1bbcc979e8f5cc260450ec72dc5aafeb782531f", + "sha256:dd3e232da5703e7fd14c610247dbe25dfd8e3278069b4b8bcf9778ba06b77569" ], "index": "ia", - "version": "==0.14.1" + "version": "==0.15.1" }, "pytest-pythonpath": { "hashes": [ @@ -979,19 +1001,19 @@ }, "requests": { "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], "index": "ia", - "version": "==2.22.0" + "version": "==2.23.0" }, "responses": { "hashes": [ - "sha256:515fd7c024097e5da76e9c4cf719083d181f1c3ddc09c2e0e49284ce863dd263", - "sha256:8ce8cb4e7e1ad89336f8865af152e0563d2e7f0e0b86d2cf75f015f819409243" + "sha256:0474ce3c897fbcc1aef286117c93499882d5c440f06a805947e4b1cb5ab3d474", + "sha256:f83613479a021e233e82d52ffb3e2e0e2836d24b0cc88a0fa31978789f78d0e5" ], "index": "ia", - "version": "==0.10.9" + "version": "==0.10.12" }, "simplegeneric": { "hashes": [ @@ -1001,52 +1023,50 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==1.13.0" + "version": "==1.14.0" }, "traitlets": { "hashes": [ "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44", "sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", "version": "==4.3.3" }, "typed-ast": { "hashes": [ - "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", - "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", - "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", - "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", - "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", - "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", - "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", - "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", - "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", - "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", - "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", - "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", - "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", - "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", - "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", - "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", - "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", - "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", - "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", - "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" ], "markers": "implementation_name == 'cpython' and python_version < '3.8'", - "version": "==1.4.0" + "version": "==1.4.1" }, "urllib3": { "hashes": [ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], - "markers": "python_version >= '3.4'", "version": "==1.22" }, "wcwidth": { @@ -1064,11 +1084,11 @@ }, "zipp": { "hashes": [ - "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", - "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1", + "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921" ], - "markers": "python_version >= '3.5'", - "version": "==0.6.0" + "index": "ia", + "version": "==1.2.0" } } } diff --git a/python/ingest_file.py b/python/ingest_file.py index d4fdcac..f6f694e 100755 --- a/python/ingest_file.py +++ b/python/ingest_file.py @@ -17,7 +17,9 @@ def run_single_ingest(args): ) if args.force_recrawl: request['force_recrawl'] = True - ingester = IngestFileWorker() + ingester = IngestFileWorker( + try_spn2=not args.no_spn2, + ) result = ingester.process(request) print(json.dumps(result, sort_keys=True)) return result @@ -51,6 +53,9 @@ def main(): sub_single.add_argument('--force-recrawl', action='store_true', help="ignore GWB history and use SPNv2 to re-crawl") + sub_single.add_argument('--no-spn2', + action='store_true', + help="don't use live web (SPNv2)") sub_single.add_argument('--type', default="pdf", help="type of ingest (pdf, html, etc)") diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 3d49096..492b558 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -1,7 +1,7 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker -from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime +from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow from .ingest import IngestFileWorker diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 7211ee0..5dc5b55 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -9,7 +9,7 @@ from collections import namedtuple from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult from sandcrawler.grobid import GrobidClient -from sandcrawler.misc import gen_file_metadata +from sandcrawler.misc import gen_file_metadata, clean_url from sandcrawler.html import extract_fulltext_url from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient @@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker): request['ingest_type'] = "pdf" assert request.get('ingest_type') == "pdf" ingest_type = request.get('ingest_type') - base_url = request['base_url'] + + # parse/clean URL + # note that we pass through the original/raw URL, and that is what gets + # persisted in database table + base_url = clean_url(request['base_url']) force_recrawl = bool(request.get('force_recrawl', False)) diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index 88669e6..d9c9d55 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -6,8 +6,15 @@ import datetime import requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error +import urlcanon +def clean_url(s): + parsed = urlcanon.parse_url(s) + if not parsed.port and parsed.colon_before_port: + parsed.colon_before_port = b'' + return str(urlcanon.whatwg(parsed)) + def gen_file_metadata(blob): """ Takes a file blob (bytestream) and returns hashes and other metadata. diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index fa46f10..03a1f29 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ This script is intended to be used for backfill ingest of old crawls. It can diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 563855d..494ec7a 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ This script is used to turn ingest request postgres rows (in JSON export diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index c51a152..2999574 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ Transform an unpaywall dump (JSON) into ingest requests. diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 420bc07..29f9e9f 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,7 +1,7 @@ import pytest -from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line +from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url def test_gen_file_metadata(): @@ -69,3 +69,9 @@ def test_invalid_cdx(): print("bad datetime") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" assert parse_cdx_line(raw) == None + +def test_clean_url(): + assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf" + assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \ + "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view" + |