diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/Pipfile | 4 | ||||
-rw-r--r-- | python/Pipfile.lock | 438 | ||||
-rwxr-xr-x | python/ingest_file.py | 7 | ||||
-rwxr-xr-x | python/persist_tool.py | 4 | ||||
-rw-r--r-- | python/sandcrawler/__init__.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/ia.py | 15 | ||||
-rw-r--r-- | python/sandcrawler/ingest.py | 24 | ||||
-rw-r--r-- | python/sandcrawler/misc.py | 7 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 17 | ||||
-rw-r--r-- | python/sandcrawler/workers.py | 6 | ||||
-rwxr-xr-x | python/sandcrawler_worker.py | 4 | ||||
-rwxr-xr-x | python/scripts/arabesque2ingestrequest.py | 2 | ||||
-rwxr-xr-x | python/scripts/ingestrequest_row2json.py | 2 | ||||
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 2 | ||||
-rw-r--r-- | python/tests/test_misc.py | 8 |
15 files changed, 313 insertions, 229 deletions
diff --git a/python/Pipfile b/python/Pipfile index f7e59c5..fc63697 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -34,6 +34,10 @@ python-magic = "*" ftfy = "*" internetarchive = "*" Flask = ">=1" +urlcanon = "*" + +# this is only to lock to a python3.5-compatible version +zipp = "<2.0.0" [requires] python_version = "3.5" diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 4de99d5..07e7484 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "87b092df5b5472b2b42e8d4c42e73319968c36729cc1bd3fb6de0eb3346f2a6b" + "sha256": "ac2202b1be9e35ebd698dd609af8d868afb3fdd0134c5581925138b51d4f475f" }, "pipfile-spec": 6, "requires": { @@ -21,6 +21,22 @@ ] }, "default": { + "aerospike": { + "hashes": [ + "sha256:3c3edb9c59491100cf5f9b0d802ee0b812b32b626c16358133cf5b9931ab8620", + "sha256:42e6ed4f6298aab4e5094f45a69fc805f925fbaa4ec206a87ce0a2048df02d4d", + "sha256:67684fb6af531765eb6061e37597bc73a348a2eff141795447ab20d9c6a61289", + "sha256:6aec5e0dbedb8ddd97441abaebedb04d4abbd51bfcfd6f0a6722fabc5be4efd0", + "sha256:9280ecb0257b0b706df7ac934dc03f518641934479d9c925a46af5231fb65f40", + "sha256:98779725a86ef345b9fec0b5ef60b59b2430b9c8c8e8904adb7945af6d6f9ffb", + "sha256:99de79a26f184a47a67123899e093cecd5c3bc0b0ce92da4f302684ad0b0116c", + "sha256:b170b637d69f49c02d021477359866c3d89a2c0d1477bec19343828f890d3cb1", + "sha256:d2f0b0288e2efafb99bbada6b39714285f317dc47fde3c4458b76e8cfbb71c11", + "sha256:d83faa27d40af320058a93902e19173f6295acbcc9ca225c552d7648169859f0", + "sha256:db7efad41300cb9bd6e70534c3110fce4e474db6d4288428609e0214a021aab8" + ], + "version": "==3.10.0" + }, "args": { "hashes": [ "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814" @@ -51,19 +67,18 @@ }, "boto3": { "hashes": [ - "sha256:5c00d51101d6a7ddf2207ae8a738e5c815c5fcffbee76121f38bd41d83c936a5", - "sha256:aa58c8de6aed36211e0897598de2a3d89122ad8cd1450165679720180ab880ef" + "sha256:5e145fa27b56c6f3db20c20d4e0084e146a400007064af8d46c1b3bfcc779c42", + "sha256:95d2fb21e3f0575f8b599706100c42a5a7ae576671f6f4beea6e42ab0f57166b" ], "index": "ia", - "version": "==1.10.50" + "version": "==1.12.18" }, "botocore": { "hashes": [ - "sha256:765a5c637ff792239727c327b221ed5a4d851e9f176ce8b8b9eca536425c74d4", - "sha256:adb4cb188cd0866e7337f9a049fc68db042b0340fd496d40bca349c8dbfc6a2d" + "sha256:2f1a54e19531bdf2d953c2db5f76b49c6936565366b2503a528b04cbbf55671a", + "sha256:9827c6f3bffb9e316427c276e3f22e5d82b377dd43dcbbbe71ecbd9ac2b959ab" ], - "markers": "python_version >= '2.6' and python_version != '3.1.*' and python_version != '3.0.*'", - "version": "==1.13.50" + "version": "==1.15.18" }, "brotli": { "hashes": [ @@ -126,11 +141,10 @@ }, "click": { "hashes": [ - "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", - "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" + "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc", + "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a" ], - "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'", - "version": "==7.0" + "version": "==7.1.1" }, "clint": { "hashes": [ @@ -138,6 +152,13 @@ ], "version": "==0.5.1" }, + "configparser": { + "hashes": [ + "sha256:254c1d9c79f60c45dfde850850883d5aaa7f19a23f13561243a050d5a7c3fe4c", + "sha256:c7d282687a5308319bf3d2e7706e575c635b0a470342641c93bea0ea3b5331df" + ], + "version": "==4.0.2" + }, "confluent-kafka": { "hashes": [ "sha256:0efd716da4f03f99d45fbb0d1583c5c8bf1eabc258a883588e3cd6ee06c0facb", @@ -187,31 +208,28 @@ "hashes": [ "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.1.4.8" }, "dawg": { "hashes": [ - "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5", - "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953", - "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e", - "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8", - "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3", - "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18", - "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55", - "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9", - "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171", - "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9" + "sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753", + "sha256:34881e06278d4a54cf0b402c0c8b587bef0caa78f0eee595adc7a2aa530e48ce", + "sha256:73760ad1272b1b47997f1a768b8f3bf547c92475bcd62185f4ab7e1bc691964e", + "sha256:7aecc4c89243edaf1efe7a4d769d993a7cd9307a8a04f48e07c4fc7c44bdd38f", + "sha256:83ce4a73f7632b0ed31af16c2750533ecbed347bad1148a52f6436e348b5b7ac", + "sha256:d78929f5a7f7e083f5720992068535d133f0d3326f0c677c61c59256aa43d95e", + "sha256:e664a884ca48f2599ad5c2289d9b7f769e77d266560c79992e3db2cfce96cb1b", + "sha256:fb90b799fb7d6d728531840529c812a9ee17736da71e8a596ede8bfd6c62bf36", + "sha256:feb6073e0d02ac54389ad378e6c695e28fe579e2772c225a854299752effece6" ], - "version": "==0.7.8" + "version": "==0.8.0" }, "decorator": { "hashes": [ - "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce", - "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d" + "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760", + "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==4.4.1" + "version": "==4.4.2" }, "docopt": { "hashes": [ @@ -225,14 +243,12 @@ "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" ], - "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.15.2" }, "dogpile.cache": { "hashes": [ "sha256:b348835825c9dcd251d9aad1f89f257277ac198a3e35a61980ab4cb28c75216b" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.9.0" }, "elasticsearch": { @@ -240,7 +256,6 @@ "sha256:1f0f633e3b500d5042424f75a505badf8c4b9962c1b4734cdfb3087fb67920be", "sha256:fb5ab15ee283f104b5a7a5695c7e879cb2927e4eb5aed9c530811590b41259ad" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version < '4' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'", "version": "==6.4.0" }, "flask": { @@ -253,23 +268,22 @@ }, "ftfy": { "hashes": [ - "sha256:6d7509c45e602dec890f0f6ee0623a8b5f50ec1188ac7ab9535e18e572c99bcc" + "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8" ], "index": "ia", - "version": "==5.6" + "version": "==5.7" }, "globalwayback": { "hashes": [ - "sha256:ddd1fac7caad4181e8e623cb67ef3f6a6f7c0f306140c450b92f8bb3032aba51" + "sha256:6f7bd270ef827c1d8c1d5631c313dbf401e300993b593179699741fcdbc5295d" ], "index": "ia", - "version": "==0.4.13" + "version": "==0.5.2" }, "ialib": { "hashes": [ "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==0.3.0.1" }, "idna": { @@ -292,38 +306,34 @@ "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19", "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749" ], - "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'", "version": "==1.1.0" }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250", + "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==2.10.3" + "version": "==2.11.1" }, "jmespath": { "hashes": [ - "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6", - "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c" + "sha256:695cb76fa78a10663425d5b73ddc5714eb711157e52704d69be03b1a02ba4fec", + "sha256:cca55c8d153173e21baa59983015ad0daf603f9cb799904ff057bfb8ff8dc2d9" ], - "version": "==0.9.4" + "version": "==0.9.5" }, "jsonpatch": { "hashes": [ - "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6", - "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a" + "sha256:cc3a7241010a1fd3f50145a3b33be2c03c1e679faa19934b628bb07d0f64819e", + "sha256:ddc0f7628b8bfdd62e3cbfbc24ca6671b0b6265b50d186c2cf3659dc0f78fd6a" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==1.24" + "version": "==1.25" }, "jsonpointer": { "hashes": [ "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362", "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", "version": "==2.0" }, "kazoo": { @@ -331,7 +341,6 @@ "sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4", "sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==2.5.0" }, "markupsafe": { @@ -340,13 +349,16 @@ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", @@ -363,42 +375,68 @@ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", "version": "==1.1.1" }, "minio": { "hashes": [ - "sha256:daef713dfaaa232a719c4dc58f0bed90b6d11ab14e94cd013cef7e7ed5b2cfdc", - "sha256:e5dc4670fe5c3e0ef61e9556bbae4768fcb051f3deb73d89e40a1b470dc9bd88" + "sha256:7543f990231b0d605f35b9140ec51cdb6335e741bb8c45dea9b8746c248a54bf", + "sha256:bf9b5001273108864e09a330a0a1795c6caee6036a124a607916830544afbf5f", + "sha256:f1987811525120d68a420fc6142ca310df174123da56742233824342e3a8c3f8" ], "index": "ia", - "version": "==5.0.6" + "version": "==5.0.7" }, "pillow": { "hashes": [ + "sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8", "sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b", + "sha256:1937c9e17f685fe6c360dd96ddb8f93f159ac721939ccbfc91a62d8124a29945", + "sha256:1ce73edaeb49af9ebeacfb8c58428ae39592839d3a7a16ef3926773f1c8ff8ee", + "sha256:2052f0372123c98497ee3294f4e20347d87b9f70d9c65ce2fc520b9339aa8465", "sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe", + "sha256:26b4a2bcdf0e674505fcd2f1a882f29a99339ac3b5a8e7997b90ede2995434e2", + "sha256:282e069f92e43047b34bbd995a8800669af11d038db571758708ebcd96462964", + "sha256:2ee30463cbc8e60cba92722a3a64881a0d3df534a047e299a9bcf62ea34fd061", "sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b", "sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608", "sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac", "sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389", + "sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c", "sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff", + "sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8", "sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f", + "sha256:5bcea0df97fe0b911a6629aab0997b98e8811561c27167266758a7ede173123e", "sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b", + "sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f", + "sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51", + "sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3", + "sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2", "sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3", "sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05", + "sha256:82351254350d9212bccf71d387efae8ad8f6f4b904d095546a77852a6b16e05b", + "sha256:95236f64904157256254b6cc8e29feecd9ee6985732dcb36c9f58d7dabe081d0", "sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64", "sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7", "sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842", + "sha256:9e274583a0eab0b6d227139146e28f74488cfbc0d262c4ba2e5c0998b9c498d2", "sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a", + "sha256:a10befeb7b9975d7c3d2ca3eaf0cb505db98fe50874130e182c2a6f7a606591f", "sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3", + "sha256:b55f62882d8db466fcf2228422bf3147617744888bf0cf6dffb3254a52eb316b", + "sha256:bf83901c158ad92e77e990f51531434e5a96c6aef805a84b6e3bfe825f4d4d0c", + "sha256:c32f99a0c7c5313b2df78399ef908563b319de23bec0cc89f1d04c37be19eb2d", "sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984", + "sha256:cf4b3c634b317ee247c3add4375b0a6bdc45eb0c12a5d7fbf9bfd47ec10b020f", "sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3", "sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c", + "sha256:e04df3808d6202dd552c837c824796899c09ff0ff9c335607904e31f9d387110", "sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb", "sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0", + "sha256:eae3711a7916eb5ec800dfb6963da09db0ada63c0481639dd0ddc0b505883a02", "sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c", "sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394" ], @@ -431,6 +469,7 @@ }, "pykafka": { "hashes": [ + "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459", "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577" ], "index": "ia", @@ -454,7 +493,6 @@ "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" ], - "markers": "python_version >= '2.7'", "version": "==2.8.1" }, "python-magic": { @@ -497,7 +535,6 @@ "sha256:e3a057b7a64f1222b56e47bcff5e4b94c4f61faac04c7c4ecb1985e18caa3994", "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615" ], - "markers": "python_version != '3.2.*' and python_version != '3.4.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'", "version": "==5.3" }, "raven": { @@ -506,24 +543,22 @@ "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4" ], "index": "ia", - "markers": null, "version": "==6.10.0" }, "redis": { "hashes": [ - "sha256:3613daad9ce5951e426f460deddd5caf469e08a3af633e9578fc77d362becf62", - "sha256:8d0fc278d3f5e1249967cba2eb4a5632d19e45ce5c09442b8422d15ee2c22cc2" + "sha256:0dcfb335921b88a850d461dc255ff4708294943322bd55de6cfd68972490ca1f", + "sha256:b205cffd05ebfd0a468db74f0eedbff8df1a7bfc47521516ade4692991bb0833" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==3.3.11" + "version": "==3.4.1" }, "requests": { "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], "index": "ia", - "version": "==2.22.0" + "version": "==2.23.0" }, "requests-file": { "hashes": [ @@ -540,11 +575,10 @@ }, "s3transfer": { "hashes": [ - "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d", - "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba" + "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13", + "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", - "version": "==0.2.1" + "version": "==0.3.3" }, "schedule": { "hashes": [ @@ -562,25 +596,23 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==1.13.0" + "version": "==1.14.0" }, "soupsieve": { "hashes": [ - "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5", - "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda" + "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae", + "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69" ], - "version": "==1.9.5" + "version": "==2.0" }, "sqlalchemy": { "hashes": [ - "sha256:bfb8f464a5000b567ac1d350b9090cf081180ec1ab4aa87e7bca12dab25320ec" + "sha256:b92d2de62e43499d85b1780274d1b562e5159c7996f6f04a9bb46cf681ced45f" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==1.3.12" + "version": "==1.3.14" }, "surt": { "hashes": [ @@ -600,7 +632,6 @@ "sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7", "sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==2.2.2" }, "twitter": { @@ -610,12 +641,18 @@ ], "version": "==1.18.0" }, + "urlcanon": { + "hashes": [ + "sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62" + ], + "index": "ia", + "version": "==0.3.1" + }, "urllib3": { "hashes": [ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], - "markers": "python_version >= '3.4'", "version": "==1.22" }, "warctools": { @@ -626,10 +663,10 @@ }, "wayback": { "hashes": [ - "sha256:3e89df1a3cb49baffe03572a77d00d97d54ccebeb4dd24f19d8f2b8ec3812ad3" + "sha256:936ae4c75af922e0f4b1bc82c66f51f97687cc6a1b8f3f1a19ec8fa7ab11ec41" ], "index": "ia", - "version": "==0.5.1" + "version": "==0.5.3" }, "wayback-esp": { "hashes": [ @@ -647,7 +684,6 @@ "hashes": [ "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==0.1.5" }, "wcwidth": { @@ -659,11 +695,18 @@ }, "werkzeug": { "hashes": [ - "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", - "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", + "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==0.16.0" + "version": "==1.0.0" + }, + "zipp": { + "hashes": [ + "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1", + "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921" + ], + "index": "ia", + "version": "==1.2.0" } }, "develop": { @@ -674,20 +717,11 @@ ], "version": "==2.3.3" }, - "atomicwrites": { - "hashes": [ - "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", - "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" - ], - "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'", - "version": "==1.3.0" - }, "attrs": { "hashes": [ "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==19.3.0" }, "backcall": { @@ -713,48 +747,46 @@ }, "coverage": { "hashes": [ - "sha256:189aac76d6e0d7af15572c51892e7326ee451c076c5a50a9d266406cd6c49708", - "sha256:1bf7ba2af1d373a1750888724f84cffdfc697738f29a353c98195f98fc011509", - "sha256:1f4ee8e2e4243971618bc16fcc4478317405205f135e95226c2496e2a3b8dbbf", - "sha256:225e79a5d485bc1642cb7ba02281419c633c216cdc6b26c26494ba959f09e69f", - "sha256:23688ff75adfa8bfa2a67254d889f9bdf9302c27241d746e17547c42c732d3f4", - "sha256:28f7f73b34a05e23758e860a89a7f649b85c6749e252eff60ebb05532d180e86", - "sha256:2d0cb9b1fe6ad0d915d45ad3d87f03a38e979093a98597e755930db1f897afae", - "sha256:47874b4711c5aeb295c31b228a758ce3d096be83dc37bd56da48ed99efb8813b", - "sha256:511ec0c00840e12fb4e852e4db58fa6a01ca4da72f36a9766fae344c3d502033", - "sha256:53e7438fef0c97bc248f88ba1edd10268cd94d5609970aaf87abbe493691af87", - "sha256:569f9ee3025682afda6e9b0f5bb14897c0db03f1a1dc088b083dd36e743f92bb", - "sha256:593853aa1ac6dcc6405324d877544c596c9d948ef20d2e9512a0f5d2d3202356", - "sha256:5b0a07158360d22492f9abd02a0f2ee7981b33f0646bf796598b7673f6bbab14", - "sha256:7ca3db38a61f3655a2613ee2c190d63639215a7a736d3c64cc7bbdb002ce6310", - "sha256:7d1cc7acc9ce55179616cf72154f9e648136ea55987edf84addbcd9886ffeba2", - "sha256:88b51153657612aea68fa684a5b88037597925260392b7bb4509d4f9b0bdd889", - "sha256:955ec084f549128fa2702f0b2dc696392001d986b71acd8fd47424f28289a9c3", - "sha256:b251c7092cbb6d789d62dc9c9e7c4fb448c9138b51285c36aeb72462cad3600e", - "sha256:bd82b684bb498c60ef47bb1541a50e6d006dde8579934dcbdbc61d67d1ea70d9", - "sha256:bfe102659e2ec13b86c7f3b1db6c9a4e7beea4255058d006351339e6b342d5d2", - "sha256:c1e4e39e43057396a5e9d069bfbb6ffeee892e40c5d2effbd8cd71f34ee66c4d", - "sha256:cb2b74c123f65e8166f7e1265829a6c8ed755c3cd16d7f50e75a83456a5f3fd7", - "sha256:cca38ded59105f7705ef6ffe1e960b8db6c7d8279c1e71654a4775ab4454ca15", - "sha256:cf908840896f7aa62d0ec693beb53264b154f972eb8226fb864ac38975590c4f", - "sha256:d095a7b473f8a95f7efe821f92058c8a2ecfb18f8db6677ae3819e15dc11aaae", - "sha256:d22b4297e7e4225ccf01f1aa55e7a96412ea0796b532dd614c3fcbafa341128e", - "sha256:d4a2b578a7a70e0c71f662705262f87a456f1e6c1e40ada7ea699abaf070a76d", - "sha256:ddeb42a3d5419434742bf4cc71c9eaa22df3b76808e23a82bd0b0bd360f1a9f1", - "sha256:e65a5aa1670db6263f19fdc03daee1d7dbbadb5cb67fd0a1f16033659db13c1d", - "sha256:eaad65bd20955131bcdb3967a4dea66b4e4d4ca488efed7c00d91ee0173387e8", - "sha256:f45fba420b94165c17896861bb0e8b27fb7abdcedfeb154895d8553df90b7b00" - ], - "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version < '4' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'", - "version": "==5.0.2" + "sha256:15cf13a6896048d6d947bf7d222f36e4809ab926894beb748fc9caa14605d9c3", + "sha256:1daa3eceed220f9fdb80d5ff950dd95112cd27f70d004c7918ca6dfc6c47054c", + "sha256:1e44a022500d944d42f94df76727ba3fc0a5c0b672c358b61067abb88caee7a0", + "sha256:25dbf1110d70bab68a74b4b9d74f30e99b177cde3388e07cc7272f2168bd1477", + "sha256:3230d1003eec018ad4a472d254991e34241e0bbd513e97a29727c7c2f637bd2a", + "sha256:3dbb72eaeea5763676a1a1efd9b427a048c97c39ed92e13336e726117d0b72bf", + "sha256:5012d3b8d5a500834783689a5d2292fe06ec75dc86ee1ccdad04b6f5bf231691", + "sha256:51bc7710b13a2ae0c726f69756cf7ffd4362f4ac36546e243136187cfcc8aa73", + "sha256:527b4f316e6bf7755082a783726da20671a0cc388b786a64417780b90565b987", + "sha256:722e4557c8039aad9592c6a4213db75da08c2cd9945320220634f637251c3894", + "sha256:76e2057e8ffba5472fd28a3a010431fd9e928885ff480cb278877c6e9943cc2e", + "sha256:77afca04240c40450c331fa796b3eab6f1e15c5ecf8bf2b8bee9706cd5452fef", + "sha256:7afad9835e7a651d3551eab18cbc0fdb888f0a6136169fbef0662d9cdc9987cf", + "sha256:9bea19ac2f08672636350f203db89382121c9c2ade85d945953ef3c8cf9d2a68", + "sha256:a8b8ac7876bc3598e43e2603f772d2353d9931709345ad6c1149009fd1bc81b8", + "sha256:b0840b45187699affd4c6588286d429cd79a99d509fe3de0f209594669bb0954", + "sha256:b26aaf69713e5674efbde4d728fb7124e429c9466aeaf5f4a7e9e699b12c9fe2", + "sha256:b63dd43f455ba878e5e9f80ba4f748c0a2156dde6e0e6e690310e24d6e8caf40", + "sha256:be18f4ae5a9e46edae3f329de2191747966a34a3d93046dbdf897319923923bc", + "sha256:c312e57847db2526bc92b9bfa78266bfbaabac3fdcd751df4d062cd4c23e46dc", + "sha256:c60097190fe9dc2b329a0eb03393e2e0829156a589bd732e70794c0dd804258e", + "sha256:c62a2143e1313944bf4a5ab34fd3b4be15367a02e9478b0ce800cb510e3bbb9d", + "sha256:cc1109f54a14d940b8512ee9f1c3975c181bbb200306c6d8b87d93376538782f", + "sha256:cd60f507c125ac0ad83f05803063bed27e50fa903b9c2cfee3f8a6867ca600fc", + "sha256:d513cc3db248e566e07a0da99c230aca3556d9b09ed02f420664e2da97eac301", + "sha256:d649dc0bcace6fcdb446ae02b98798a856593b19b637c1b9af8edadf2b150bea", + "sha256:d7008a6796095a79544f4da1ee49418901961c97ca9e9d44904205ff7d6aa8cb", + "sha256:da93027835164b8223e8e5af2cf902a4c80ed93cb0909417234f4a9df3bcd9af", + "sha256:e69215621707119c6baf99bda014a45b999d37602cb7043d943c76a59b05bf52", + "sha256:ea9525e0fef2de9208250d6c5aeeee0138921057cd67fcef90fbed49c4d62d37", + "sha256:fca1669d464f0c9831fd10be2eef6b86f5ebd76c724d1e0706ebdff86bb4adf0" + ], + "version": "==5.0.3" }, "decorator": { "hashes": [ - "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce", - "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d" + "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760", + "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==4.4.1" + "version": "==4.4.2" }, "idna": { "hashes": [ @@ -765,11 +797,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45", - "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f" + "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", + "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" ], "markers": "python_version < '3.8'", - "version": "==1.3.0" + "version": "==1.5.0" }, "ipython": { "hashes": [ @@ -791,15 +823,14 @@ "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1", "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==4.3.21" }, "jedi": { "hashes": [ - "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064", - "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807" + "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2", + "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5" ], - "version": "==0.15.2" + "version": "==0.16.0" }, "lazy-object-proxy": { "hashes": [ @@ -825,7 +856,6 @@ "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4", "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==1.4.3" }, "mccabe": { @@ -837,26 +867,24 @@ }, "more-itertools": { "hashes": [ - "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d", - "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564" + "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", + "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507" ], - "markers": "python_version > '2.7'", - "version": "==8.0.2" + "version": "==8.2.0" }, "packaging": { "hashes": [ - "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb", - "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8" + "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3", + "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==20.0" + "version": "==20.3" }, "parso": { "hashes": [ - "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1", - "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3" + "sha256:0c5659e0c6eba20636f99a04f469798dca8da279645ce5c387315b2c23912157", + "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995" ], - "version": "==0.5.2" + "version": "==0.6.2" }, "pathlib2": { "hashes": [ @@ -868,11 +896,11 @@ }, "pexpect": { "hashes": [ - "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", - "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" + "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937", + "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c" ], "markers": "sys_platform != 'win32'", - "version": "==4.7.0" + "version": "==4.8.0" }, "pickleshare": { "hashes": [ @@ -886,7 +914,6 @@ "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" ], - "markers": "python_version >= '3.5'", "version": "==0.13.1" }, "prompt-toolkit": { @@ -895,7 +922,6 @@ "sha256:dd4fca02c8069497ad931a2d09914c6b0d1b50151ce876bc15bde4c747090126", "sha256:f7eec66105baf40eda9ab026cd8b2e251337eea8d111196695d82e0c5f0af852" ], - "markers": "python_version != '3.1.*' and python_version >= '2.6' and python_version != '3.0.*'", "version": "==1.0.18" }, "ptyprocess": { @@ -910,16 +936,14 @@ "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa", "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==1.8.1" }, "pygments": { "hashes": [ - "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b", - "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe" + "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44", + "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324" ], - "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'", - "version": "==2.5.2" + "version": "==2.6.1" }, "pylint": { "hashes": [ @@ -934,16 +958,15 @@ "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f", "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", "version": "==2.4.6" }, "pytest": { "hashes": [ - "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa", - "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4" + "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d", + "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6" ], "index": "ia", - "version": "==5.3.2" + "version": "==5.3.5" }, "pytest-cov": { "hashes": [ @@ -963,12 +986,11 @@ }, "pytest-pylint": { "hashes": [ - "sha256:8c38ea779e540e27ec4378b0820d906006e09f4ac834defbd886abbf57c7d2ec", - "sha256:a4f5e5007f88c2095dcac799e9f7eed3d7e7a2e657596e26093814980ff5fa20", - "sha256:a574c246535308f8f6ceac10fa82f8fffffa837071f7985b22515895185700c1" + "sha256:cac5d565182f39fbb7fa7f4ef1bbcc979e8f5cc260450ec72dc5aafeb782531f", + "sha256:dd3e232da5703e7fd14c610247dbe25dfd8e3278069b4b8bcf9778ba06b77569" ], "index": "ia", - "version": "==0.14.1" + "version": "==0.15.1" }, "pytest-pythonpath": { "hashes": [ @@ -979,19 +1001,19 @@ }, "requests": { "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], "index": "ia", - "version": "==2.22.0" + "version": "==2.23.0" }, "responses": { "hashes": [ - "sha256:515fd7c024097e5da76e9c4cf719083d181f1c3ddc09c2e0e49284ce863dd263", - "sha256:8ce8cb4e7e1ad89336f8865af152e0563d2e7f0e0b86d2cf75f015f819409243" + "sha256:0474ce3c897fbcc1aef286117c93499882d5c440f06a805947e4b1cb5ab3d474", + "sha256:f83613479a021e233e82d52ffb3e2e0e2836d24b0cc88a0fa31978789f78d0e5" ], "index": "ia", - "version": "==0.10.9" + "version": "==0.10.12" }, "simplegeneric": { "hashes": [ @@ -1001,52 +1023,50 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==1.13.0" + "version": "==1.14.0" }, "traitlets": { "hashes": [ "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44", "sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", "version": "==4.3.3" }, "typed-ast": { "hashes": [ - "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", - "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", - "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", - "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", - "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", - "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", - "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", - "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", - "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", - "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", - "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", - "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", - "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", - "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", - "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", - "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", - "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", - "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", - "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", - "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" ], "markers": "implementation_name == 'cpython' and python_version < '3.8'", - "version": "==1.4.0" + "version": "==1.4.1" }, "urllib3": { "hashes": [ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], - "markers": "python_version >= '3.4'", "version": "==1.22" }, "wcwidth": { @@ -1064,11 +1084,11 @@ }, "zipp": { "hashes": [ - "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", - "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1", + "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921" ], - "markers": "python_version >= '3.5'", - "version": "==0.6.0" + "index": "ia", + "version": "==1.2.0" } } } diff --git a/python/ingest_file.py b/python/ingest_file.py index d4fdcac..f6f694e 100755 --- a/python/ingest_file.py +++ b/python/ingest_file.py @@ -17,7 +17,9 @@ def run_single_ingest(args): ) if args.force_recrawl: request['force_recrawl'] = True - ingester = IngestFileWorker() + ingester = IngestFileWorker( + try_spn2=not args.no_spn2, + ) result = ingester.process(request) print(json.dumps(result, sort_keys=True)) return result @@ -51,6 +53,9 @@ def main(): sub_single.add_argument('--force-recrawl', action='store_true', help="ignore GWB history and use SPNv2 to re-crawl") + sub_single.add_argument('--no-spn2', + action='store_true', + help="don't use live web (SPNv2)") sub_single.add_argument('--type', default="pdf", help="type of ingest (pdf, html, etc)") diff --git a/python/persist_tool.py b/python/persist_tool.py index 19e6dd7..869af06 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -41,6 +41,7 @@ def run_grobid(args): s3_access_key=args.s3_access_key, s3_secret_key=args.s3_secret_key, s3_only=args.s3_only, + db_only=args.db_only, ) pusher = JsonLinePusher( worker, @@ -135,6 +136,9 @@ def main(): sub_grobid.add_argument('--s3-only', action='store_true', help="only upload TEI-XML to S3 (don't write to database)") + sub_grobid.add_argument('--db-only', + action='store_true', + help="only write status to sandcrawler-db (don't save TEI-XML to S3)") sub_grobid_disk = subparsers.add_parser('grobid-disk', help="dump GRBOID output to (local) files on disk") diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 3d49096..492b558 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -1,7 +1,7 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker -from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime +from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow from .ingest import IngestFileWorker diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 99e92be..25697be 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -575,6 +575,7 @@ class WaybackClient: next_url = start_url urls_seen = [start_url] for i in range(self.max_redirects): + print(" URL: {}".format(next_url), file=sys.stderr) cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype) #print(cdx_row, file=sys.stderr) if not cdx_row: @@ -659,7 +660,12 @@ class WaybackClient: cdx=cdx_row, revisit_cdx=None, ) - next_url = resource.location + if resource.location.startswith('/'): + # redirect location does not include hostname + domain_prefix = '/'.join(next_url.split('/')[:3]) + next_url = domain_prefix + resource.location + else: + next_url = resource.location else: next_url = self.fetch_replay_redirect( url=cdx_row.url, @@ -753,7 +759,7 @@ class SavePageNowClient: self.poll_count = 60 self.poll_seconds = 3.0 - def save_url_now_v2(self, request_url): + def save_url_now_v2(self, request_url, force_get=0): """ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed at all, or raises an exception if there was an error with SPN itself. @@ -792,6 +798,7 @@ class SavePageNowClient: 'capture_all': 1, 'capture_screenshot': 0, 'if_not_archived_within': '1d', + 'force_get': force_get, }, ) if resp.status_code == 429: @@ -861,14 +868,14 @@ class SavePageNowClient: None, ) - def crawl_resource(self, start_url, wayback_client): + def crawl_resource(self, start_url, wayback_client, force_get=0): """ Runs a SPN2 crawl, then fetches body from wayback. TODO: possible to fetch from petabox? """ - spn_result = self.save_url_now_v2(start_url) + spn_result = self.save_url_now_v2(start_url, force_get=force_get) if not spn_result.success: status = spn_result.status diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 7211ee0..c9a697c 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -9,7 +9,7 @@ from collections import namedtuple from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult from sandcrawler.grobid import GrobidClient -from sandcrawler.misc import gen_file_metadata +from sandcrawler.misc import gen_file_metadata, clean_url from sandcrawler.html import extract_fulltext_url from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient @@ -93,6 +93,15 @@ class IngestFileWorker(SandcrawlerWorker): "digital.ucd.ie/", # ireland national historical ] + # these are special-case web domains for which we want SPN2 to not run + # a headless browser (brozzler), but instead simply run wget. + # the motivation could be to work around browser issues, or in the + # future possibly to increase download efficiency (wget/fetch being + # faster than browser fetch) + self.spn2_simple_get_domains = [ + ] + + def check_existing_ingest(self, base_url): """ Check in sandcrawler-db (postgres) to see if we have already ingested @@ -138,7 +147,12 @@ class IngestFileWorker(SandcrawlerWorker): if self.try_spn2 and (not resource or not resource.hit or soft404): via = "spn2" - resource = self.spn_client.crawl_resource(url, self.wayback_client) + force_get = 0 + for domain in self.spn2_simple_get_domains: + if domain in url: + force_get = 1 + break + resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get) print("[FETCH {}\t] {}\t{}".format( via, resource.status, @@ -224,7 +238,11 @@ class IngestFileWorker(SandcrawlerWorker): request['ingest_type'] = "pdf" assert request.get('ingest_type') == "pdf" ingest_type = request.get('ingest_type') - base_url = request['base_url'] + + # parse/clean URL + # note that we pass through the original/raw URL, and that is what gets + # persisted in database table + base_url = clean_url(request['base_url']) force_recrawl = bool(request.get('force_recrawl', False)) diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index 88669e6..d9c9d55 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -6,8 +6,15 @@ import datetime import requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error +import urlcanon +def clean_url(s): + parsed = urlcanon.parse_url(s) + if not parsed.port and parsed.colon_before_port: + parsed.colon_before_port = b'' + return str(urlcanon.whatwg(parsed)) + def gen_file_metadata(blob): """ Takes a file blob (bytestream) and returns hashes and other metadata. diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index f5de44a..379fd8b 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -240,6 +240,8 @@ class PersistGrobidWorker(SandcrawlerWorker): default_bucket=kwargs['s3_bucket'], ) self.s3_only = kwargs.get('s3_only', False) + self.db_only = kwargs.get('db_only', False) + assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed" def process(self, record): """ @@ -264,13 +266,14 @@ class PersistGrobidWorker(SandcrawlerWorker): continue assert len(r['key']) == 40 - resp = self.s3.put_blob( - folder="grobid", - blob=r['tei_xml'], - sha1hex=r['key'], - extension=".tei.xml", - ) - self.counts['s3-put'] += 1 + if not self.db_only: + resp = self.s3.put_blob( + folder="grobid", + blob=r['tei_xml'], + sha1hex=r['key'], + extension=".tei.xml", + ) + self.counts['s3-put'] += 1 # enhance with teixml2json metadata, if available try: diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index c290421..54bd581 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -52,6 +52,12 @@ class SandcrawlerWorker(object): print("Worker: {}".format(self.counts), file=sys.stderr) return self.counts + def process(self, task): + """ + Derived workers need to implement business logic here. + """ + raise NotImplementedError('implementation required') + class MultiprocessWrapper(SandcrawlerWorker): def __init__(self, worker, sink, jobs=None): diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 0ba4d03..5720f48 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -58,6 +58,7 @@ def run_persist_grobid(args): s3_access_key=args.s3_access_key, s3_secret_key=args.s3_secret_key, s3_only=args.s3_only, + db_only=args.db_only, ) pusher = KafkaJsonPusher( worker=worker, @@ -173,6 +174,9 @@ def main(): sub_persist_grobid.add_argument('--s3-only', action='store_true', help="only upload TEI-XML to S3 (don't write to database)") + sub_persist_grobid.add_argument('--db-only', + action='store_true', + help="only write status to database (don't upload TEI-XML to S3)") sub_persist_grobid.set_defaults(func=run_persist_grobid) sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio', diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index fa46f10..03a1f29 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ This script is intended to be used for backfill ingest of old crawls. It can diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 563855d..494ec7a 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ This script is used to turn ingest request postgres rows (in JSON export diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index c51a152..2999574 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ Transform an unpaywall dump (JSON) into ingest requests. diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 420bc07..29f9e9f 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,7 +1,7 @@ import pytest -from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line +from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url def test_gen_file_metadata(): @@ -69,3 +69,9 @@ def test_invalid_cdx(): print("bad datetime") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" assert parse_cdx_line(raw) == None + +def test_clean_url(): + assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf" + assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \ + "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view" + |