diff options
-rw-r--r-- | python/Pipfile | 4 | ||||
-rw-r--r-- | python/Pipfile.lock | 438 | ||||
-rwxr-xr-x | python/ingest_file.py | 7 | ||||
-rw-r--r-- | python/sandcrawler/__init__.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/ingest.py | 8 | ||||
-rw-r--r-- | python/sandcrawler/misc.py | 7 | ||||
-rwxr-xr-x | python/scripts/arabesque2ingestrequest.py | 2 | ||||
-rwxr-xr-x | python/scripts/ingestrequest_row2json.py | 2 | ||||
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 2 | ||||
-rw-r--r-- | python/tests/test_misc.py | 8 | ||||
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 1 | ||||
-rw-r--r-- | sql/monitoring_queries.md | 94 | ||||
-rw-r--r-- | sql/random_queries.md | 20 |
13 files changed, 375 insertions, 220 deletions
diff --git a/python/Pipfile b/python/Pipfile index f7e59c5..fc63697 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -34,6 +34,10 @@ python-magic = "*" ftfy = "*" internetarchive = "*" Flask = ">=1" +urlcanon = "*" + +# this is only to lock to a python3.5-compatible version +zipp = "<2.0.0" [requires] python_version = "3.5" diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 4de99d5..07e7484 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "87b092df5b5472b2b42e8d4c42e73319968c36729cc1bd3fb6de0eb3346f2a6b" + "sha256": "ac2202b1be9e35ebd698dd609af8d868afb3fdd0134c5581925138b51d4f475f" }, "pipfile-spec": 6, "requires": { @@ -21,6 +21,22 @@ ] }, "default": { + "aerospike": { + "hashes": [ + "sha256:3c3edb9c59491100cf5f9b0d802ee0b812b32b626c16358133cf5b9931ab8620", + "sha256:42e6ed4f6298aab4e5094f45a69fc805f925fbaa4ec206a87ce0a2048df02d4d", + "sha256:67684fb6af531765eb6061e37597bc73a348a2eff141795447ab20d9c6a61289", + "sha256:6aec5e0dbedb8ddd97441abaebedb04d4abbd51bfcfd6f0a6722fabc5be4efd0", + "sha256:9280ecb0257b0b706df7ac934dc03f518641934479d9c925a46af5231fb65f40", + "sha256:98779725a86ef345b9fec0b5ef60b59b2430b9c8c8e8904adb7945af6d6f9ffb", + "sha256:99de79a26f184a47a67123899e093cecd5c3bc0b0ce92da4f302684ad0b0116c", + "sha256:b170b637d69f49c02d021477359866c3d89a2c0d1477bec19343828f890d3cb1", + "sha256:d2f0b0288e2efafb99bbada6b39714285f317dc47fde3c4458b76e8cfbb71c11", + "sha256:d83faa27d40af320058a93902e19173f6295acbcc9ca225c552d7648169859f0", + "sha256:db7efad41300cb9bd6e70534c3110fce4e474db6d4288428609e0214a021aab8" + ], + "version": "==3.10.0" + }, "args": { "hashes": [ "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814" @@ -51,19 +67,18 @@ }, "boto3": { "hashes": [ - "sha256:5c00d51101d6a7ddf2207ae8a738e5c815c5fcffbee76121f38bd41d83c936a5", - "sha256:aa58c8de6aed36211e0897598de2a3d89122ad8cd1450165679720180ab880ef" + "sha256:5e145fa27b56c6f3db20c20d4e0084e146a400007064af8d46c1b3bfcc779c42", + "sha256:95d2fb21e3f0575f8b599706100c42a5a7ae576671f6f4beea6e42ab0f57166b" ], "index": "ia", - "version": "==1.10.50" + "version": "==1.12.18" }, "botocore": { "hashes": [ - "sha256:765a5c637ff792239727c327b221ed5a4d851e9f176ce8b8b9eca536425c74d4", - "sha256:adb4cb188cd0866e7337f9a049fc68db042b0340fd496d40bca349c8dbfc6a2d" + "sha256:2f1a54e19531bdf2d953c2db5f76b49c6936565366b2503a528b04cbbf55671a", + "sha256:9827c6f3bffb9e316427c276e3f22e5d82b377dd43dcbbbe71ecbd9ac2b959ab" ], - "markers": "python_version >= '2.6' and python_version != '3.1.*' and python_version != '3.0.*'", - "version": "==1.13.50" + "version": "==1.15.18" }, "brotli": { "hashes": [ @@ -126,11 +141,10 @@ }, "click": { "hashes": [ - "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", - "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" + "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc", + "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a" ], - "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'", - "version": "==7.0" + "version": "==7.1.1" }, "clint": { "hashes": [ @@ -138,6 +152,13 @@ ], "version": "==0.5.1" }, + "configparser": { + "hashes": [ + "sha256:254c1d9c79f60c45dfde850850883d5aaa7f19a23f13561243a050d5a7c3fe4c", + "sha256:c7d282687a5308319bf3d2e7706e575c635b0a470342641c93bea0ea3b5331df" + ], + "version": "==4.0.2" + }, "confluent-kafka": { "hashes": [ "sha256:0efd716da4f03f99d45fbb0d1583c5c8bf1eabc258a883588e3cd6ee06c0facb", @@ -187,31 +208,28 @@ "hashes": [ "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.1.4.8" }, "dawg": { "hashes": [ - "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5", - "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953", - "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e", - "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8", - "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3", - "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18", - "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55", - "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9", - "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171", - "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9" + "sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753", + "sha256:34881e06278d4a54cf0b402c0c8b587bef0caa78f0eee595adc7a2aa530e48ce", + "sha256:73760ad1272b1b47997f1a768b8f3bf547c92475bcd62185f4ab7e1bc691964e", + "sha256:7aecc4c89243edaf1efe7a4d769d993a7cd9307a8a04f48e07c4fc7c44bdd38f", + "sha256:83ce4a73f7632b0ed31af16c2750533ecbed347bad1148a52f6436e348b5b7ac", + "sha256:d78929f5a7f7e083f5720992068535d133f0d3326f0c677c61c59256aa43d95e", + "sha256:e664a884ca48f2599ad5c2289d9b7f769e77d266560c79992e3db2cfce96cb1b", + "sha256:fb90b799fb7d6d728531840529c812a9ee17736da71e8a596ede8bfd6c62bf36", + "sha256:feb6073e0d02ac54389ad378e6c695e28fe579e2772c225a854299752effece6" ], - "version": "==0.7.8" + "version": "==0.8.0" }, "decorator": { "hashes": [ - "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce", - "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d" + "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760", + "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==4.4.1" + "version": "==4.4.2" }, "docopt": { "hashes": [ @@ -225,14 +243,12 @@ "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" ], - "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.15.2" }, "dogpile.cache": { "hashes": [ "sha256:b348835825c9dcd251d9aad1f89f257277ac198a3e35a61980ab4cb28c75216b" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==0.9.0" }, "elasticsearch": { @@ -240,7 +256,6 @@ "sha256:1f0f633e3b500d5042424f75a505badf8c4b9962c1b4734cdfb3087fb67920be", "sha256:fb5ab15ee283f104b5a7a5695c7e879cb2927e4eb5aed9c530811590b41259ad" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version < '4' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'", "version": "==6.4.0" }, "flask": { @@ -253,23 +268,22 @@ }, "ftfy": { "hashes": [ - "sha256:6d7509c45e602dec890f0f6ee0623a8b5f50ec1188ac7ab9535e18e572c99bcc" + "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8" ], "index": "ia", - "version": "==5.6" + "version": "==5.7" }, "globalwayback": { "hashes": [ - "sha256:ddd1fac7caad4181e8e623cb67ef3f6a6f7c0f306140c450b92f8bb3032aba51" + "sha256:6f7bd270ef827c1d8c1d5631c313dbf401e300993b593179699741fcdbc5295d" ], "index": "ia", - "version": "==0.4.13" + "version": "==0.5.2" }, "ialib": { "hashes": [ "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==0.3.0.1" }, "idna": { @@ -292,38 +306,34 @@ "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19", "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749" ], - "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'", "version": "==1.1.0" }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250", + "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==2.10.3" + "version": "==2.11.1" }, "jmespath": { "hashes": [ - "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6", - "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c" + "sha256:695cb76fa78a10663425d5b73ddc5714eb711157e52704d69be03b1a02ba4fec", + "sha256:cca55c8d153173e21baa59983015ad0daf603f9cb799904ff057bfb8ff8dc2d9" ], - "version": "==0.9.4" + "version": "==0.9.5" }, "jsonpatch": { "hashes": [ - "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6", - "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a" + "sha256:cc3a7241010a1fd3f50145a3b33be2c03c1e679faa19934b628bb07d0f64819e", + "sha256:ddc0f7628b8bfdd62e3cbfbc24ca6671b0b6265b50d186c2cf3659dc0f78fd6a" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==1.24" + "version": "==1.25" }, "jsonpointer": { "hashes": [ "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362", "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", "version": "==2.0" }, "kazoo": { @@ -331,7 +341,6 @@ "sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4", "sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==2.5.0" }, "markupsafe": { @@ -340,13 +349,16 @@ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", @@ -363,42 +375,68 @@ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", "version": "==1.1.1" }, "minio": { "hashes": [ - "sha256:daef713dfaaa232a719c4dc58f0bed90b6d11ab14e94cd013cef7e7ed5b2cfdc", - "sha256:e5dc4670fe5c3e0ef61e9556bbae4768fcb051f3deb73d89e40a1b470dc9bd88" + "sha256:7543f990231b0d605f35b9140ec51cdb6335e741bb8c45dea9b8746c248a54bf", + "sha256:bf9b5001273108864e09a330a0a1795c6caee6036a124a607916830544afbf5f", + "sha256:f1987811525120d68a420fc6142ca310df174123da56742233824342e3a8c3f8" ], "index": "ia", - "version": "==5.0.6" + "version": "==5.0.7" }, "pillow": { "hashes": [ + "sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8", "sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b", + "sha256:1937c9e17f685fe6c360dd96ddb8f93f159ac721939ccbfc91a62d8124a29945", + "sha256:1ce73edaeb49af9ebeacfb8c58428ae39592839d3a7a16ef3926773f1c8ff8ee", + "sha256:2052f0372123c98497ee3294f4e20347d87b9f70d9c65ce2fc520b9339aa8465", "sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe", + "sha256:26b4a2bcdf0e674505fcd2f1a882f29a99339ac3b5a8e7997b90ede2995434e2", + "sha256:282e069f92e43047b34bbd995a8800669af11d038db571758708ebcd96462964", + "sha256:2ee30463cbc8e60cba92722a3a64881a0d3df534a047e299a9bcf62ea34fd061", "sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b", "sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608", "sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac", "sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389", + "sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c", "sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff", + "sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8", "sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f", + "sha256:5bcea0df97fe0b911a6629aab0997b98e8811561c27167266758a7ede173123e", "sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b", + "sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f", + "sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51", + "sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3", + "sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2", "sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3", "sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05", + "sha256:82351254350d9212bccf71d387efae8ad8f6f4b904d095546a77852a6b16e05b", + "sha256:95236f64904157256254b6cc8e29feecd9ee6985732dcb36c9f58d7dabe081d0", "sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64", "sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7", "sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842", + "sha256:9e274583a0eab0b6d227139146e28f74488cfbc0d262c4ba2e5c0998b9c498d2", "sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a", + "sha256:a10befeb7b9975d7c3d2ca3eaf0cb505db98fe50874130e182c2a6f7a606591f", "sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3", + "sha256:b55f62882d8db466fcf2228422bf3147617744888bf0cf6dffb3254a52eb316b", + "sha256:bf83901c158ad92e77e990f51531434e5a96c6aef805a84b6e3bfe825f4d4d0c", + "sha256:c32f99a0c7c5313b2df78399ef908563b319de23bec0cc89f1d04c37be19eb2d", "sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984", + "sha256:cf4b3c634b317ee247c3add4375b0a6bdc45eb0c12a5d7fbf9bfd47ec10b020f", "sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3", "sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c", + "sha256:e04df3808d6202dd552c837c824796899c09ff0ff9c335607904e31f9d387110", "sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb", "sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0", + "sha256:eae3711a7916eb5ec800dfb6963da09db0ada63c0481639dd0ddc0b505883a02", "sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c", "sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394" ], @@ -431,6 +469,7 @@ }, "pykafka": { "hashes": [ + "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459", "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577" ], "index": "ia", @@ -454,7 +493,6 @@ "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" ], - "markers": "python_version >= '2.7'", "version": "==2.8.1" }, "python-magic": { @@ -497,7 +535,6 @@ "sha256:e3a057b7a64f1222b56e47bcff5e4b94c4f61faac04c7c4ecb1985e18caa3994", "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615" ], - "markers": "python_version != '3.2.*' and python_version != '3.4.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'", "version": "==5.3" }, "raven": { @@ -506,24 +543,22 @@ "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4" ], "index": "ia", - "markers": null, "version": "==6.10.0" }, "redis": { "hashes": [ - "sha256:3613daad9ce5951e426f460deddd5caf469e08a3af633e9578fc77d362becf62", - "sha256:8d0fc278d3f5e1249967cba2eb4a5632d19e45ce5c09442b8422d15ee2c22cc2" + "sha256:0dcfb335921b88a850d461dc255ff4708294943322bd55de6cfd68972490ca1f", + "sha256:b205cffd05ebfd0a468db74f0eedbff8df1a7bfc47521516ade4692991bb0833" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==3.3.11" + "version": "==3.4.1" }, "requests": { "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], "index": "ia", - "version": "==2.22.0" + "version": "==2.23.0" }, "requests-file": { "hashes": [ @@ -540,11 +575,10 @@ }, "s3transfer": { "hashes": [ - "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d", - "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba" + "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13", + "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", - "version": "==0.2.1" + "version": "==0.3.3" }, "schedule": { "hashes": [ @@ -562,25 +596,23 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==1.13.0" + "version": "==1.14.0" }, "soupsieve": { "hashes": [ - "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5", - "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda" + "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae", + "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69" ], - "version": "==1.9.5" + "version": "==2.0" }, "sqlalchemy": { "hashes": [ - "sha256:bfb8f464a5000b567ac1d350b9090cf081180ec1ab4aa87e7bca12dab25320ec" + "sha256:b92d2de62e43499d85b1780274d1b562e5159c7996f6f04a9bb46cf681ced45f" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==1.3.12" + "version": "==1.3.14" }, "surt": { "hashes": [ @@ -600,7 +632,6 @@ "sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7", "sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808" ], - "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'", "version": "==2.2.2" }, "twitter": { @@ -610,12 +641,18 @@ ], "version": "==1.18.0" }, + "urlcanon": { + "hashes": [ + "sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62" + ], + "index": "ia", + "version": "==0.3.1" + }, "urllib3": { "hashes": [ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], - "markers": "python_version >= '3.4'", "version": "==1.22" }, "warctools": { @@ -626,10 +663,10 @@ }, "wayback": { "hashes": [ - "sha256:3e89df1a3cb49baffe03572a77d00d97d54ccebeb4dd24f19d8f2b8ec3812ad3" + "sha256:936ae4c75af922e0f4b1bc82c66f51f97687cc6a1b8f3f1a19ec8fa7ab11ec41" ], "index": "ia", - "version": "==0.5.1" + "version": "==0.5.3" }, "wayback-esp": { "hashes": [ @@ -647,7 +684,6 @@ "hashes": [ "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'", "version": "==0.1.5" }, "wcwidth": { @@ -659,11 +695,18 @@ }, "werkzeug": { "hashes": [ - "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", - "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", + "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" ], - "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'", - "version": "==0.16.0" + "version": "==1.0.0" + }, + "zipp": { + "hashes": [ + "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1", + "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921" + ], + "index": "ia", + "version": "==1.2.0" } }, "develop": { @@ -674,20 +717,11 @@ ], "version": "==2.3.3" }, - "atomicwrites": { - "hashes": [ - "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", - "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" - ], - "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'", - "version": "==1.3.0" - }, "attrs": { "hashes": [ "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==19.3.0" }, "backcall": { @@ -713,48 +747,46 @@ }, "coverage": { "hashes": [ - "sha256:189aac76d6e0d7af15572c51892e7326ee451c076c5a50a9d266406cd6c49708", - "sha256:1bf7ba2af1d373a1750888724f84cffdfc697738f29a353c98195f98fc011509", - "sha256:1f4ee8e2e4243971618bc16fcc4478317405205f135e95226c2496e2a3b8dbbf", - "sha256:225e79a5d485bc1642cb7ba02281419c633c216cdc6b26c26494ba959f09e69f", - "sha256:23688ff75adfa8bfa2a67254d889f9bdf9302c27241d746e17547c42c732d3f4", - "sha256:28f7f73b34a05e23758e860a89a7f649b85c6749e252eff60ebb05532d180e86", - "sha256:2d0cb9b1fe6ad0d915d45ad3d87f03a38e979093a98597e755930db1f897afae", - "sha256:47874b4711c5aeb295c31b228a758ce3d096be83dc37bd56da48ed99efb8813b", - "sha256:511ec0c00840e12fb4e852e4db58fa6a01ca4da72f36a9766fae344c3d502033", - "sha256:53e7438fef0c97bc248f88ba1edd10268cd94d5609970aaf87abbe493691af87", - "sha256:569f9ee3025682afda6e9b0f5bb14897c0db03f1a1dc088b083dd36e743f92bb", - "sha256:593853aa1ac6dcc6405324d877544c596c9d948ef20d2e9512a0f5d2d3202356", - "sha256:5b0a07158360d22492f9abd02a0f2ee7981b33f0646bf796598b7673f6bbab14", - "sha256:7ca3db38a61f3655a2613ee2c190d63639215a7a736d3c64cc7bbdb002ce6310", - "sha256:7d1cc7acc9ce55179616cf72154f9e648136ea55987edf84addbcd9886ffeba2", - "sha256:88b51153657612aea68fa684a5b88037597925260392b7bb4509d4f9b0bdd889", - "sha256:955ec084f549128fa2702f0b2dc696392001d986b71acd8fd47424f28289a9c3", - "sha256:b251c7092cbb6d789d62dc9c9e7c4fb448c9138b51285c36aeb72462cad3600e", - "sha256:bd82b684bb498c60ef47bb1541a50e6d006dde8579934dcbdbc61d67d1ea70d9", - "sha256:bfe102659e2ec13b86c7f3b1db6c9a4e7beea4255058d006351339e6b342d5d2", - "sha256:c1e4e39e43057396a5e9d069bfbb6ffeee892e40c5d2effbd8cd71f34ee66c4d", - "sha256:cb2b74c123f65e8166f7e1265829a6c8ed755c3cd16d7f50e75a83456a5f3fd7", - "sha256:cca38ded59105f7705ef6ffe1e960b8db6c7d8279c1e71654a4775ab4454ca15", - "sha256:cf908840896f7aa62d0ec693beb53264b154f972eb8226fb864ac38975590c4f", - "sha256:d095a7b473f8a95f7efe821f92058c8a2ecfb18f8db6677ae3819e15dc11aaae", - "sha256:d22b4297e7e4225ccf01f1aa55e7a96412ea0796b532dd614c3fcbafa341128e", - "sha256:d4a2b578a7a70e0c71f662705262f87a456f1e6c1e40ada7ea699abaf070a76d", - "sha256:ddeb42a3d5419434742bf4cc71c9eaa22df3b76808e23a82bd0b0bd360f1a9f1", - "sha256:e65a5aa1670db6263f19fdc03daee1d7dbbadb5cb67fd0a1f16033659db13c1d", - "sha256:eaad65bd20955131bcdb3967a4dea66b4e4d4ca488efed7c00d91ee0173387e8", - "sha256:f45fba420b94165c17896861bb0e8b27fb7abdcedfeb154895d8553df90b7b00" - ], - "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version < '4' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'", - "version": "==5.0.2" + "sha256:15cf13a6896048d6d947bf7d222f36e4809ab926894beb748fc9caa14605d9c3", + "sha256:1daa3eceed220f9fdb80d5ff950dd95112cd27f70d004c7918ca6dfc6c47054c", + "sha256:1e44a022500d944d42f94df76727ba3fc0a5c0b672c358b61067abb88caee7a0", + "sha256:25dbf1110d70bab68a74b4b9d74f30e99b177cde3388e07cc7272f2168bd1477", + "sha256:3230d1003eec018ad4a472d254991e34241e0bbd513e97a29727c7c2f637bd2a", + "sha256:3dbb72eaeea5763676a1a1efd9b427a048c97c39ed92e13336e726117d0b72bf", + "sha256:5012d3b8d5a500834783689a5d2292fe06ec75dc86ee1ccdad04b6f5bf231691", + "sha256:51bc7710b13a2ae0c726f69756cf7ffd4362f4ac36546e243136187cfcc8aa73", + "sha256:527b4f316e6bf7755082a783726da20671a0cc388b786a64417780b90565b987", + "sha256:722e4557c8039aad9592c6a4213db75da08c2cd9945320220634f637251c3894", + "sha256:76e2057e8ffba5472fd28a3a010431fd9e928885ff480cb278877c6e9943cc2e", + "sha256:77afca04240c40450c331fa796b3eab6f1e15c5ecf8bf2b8bee9706cd5452fef", + "sha256:7afad9835e7a651d3551eab18cbc0fdb888f0a6136169fbef0662d9cdc9987cf", + "sha256:9bea19ac2f08672636350f203db89382121c9c2ade85d945953ef3c8cf9d2a68", + "sha256:a8b8ac7876bc3598e43e2603f772d2353d9931709345ad6c1149009fd1bc81b8", + "sha256:b0840b45187699affd4c6588286d429cd79a99d509fe3de0f209594669bb0954", + "sha256:b26aaf69713e5674efbde4d728fb7124e429c9466aeaf5f4a7e9e699b12c9fe2", + "sha256:b63dd43f455ba878e5e9f80ba4f748c0a2156dde6e0e6e690310e24d6e8caf40", + "sha256:be18f4ae5a9e46edae3f329de2191747966a34a3d93046dbdf897319923923bc", + "sha256:c312e57847db2526bc92b9bfa78266bfbaabac3fdcd751df4d062cd4c23e46dc", + "sha256:c60097190fe9dc2b329a0eb03393e2e0829156a589bd732e70794c0dd804258e", + "sha256:c62a2143e1313944bf4a5ab34fd3b4be15367a02e9478b0ce800cb510e3bbb9d", + "sha256:cc1109f54a14d940b8512ee9f1c3975c181bbb200306c6d8b87d93376538782f", + "sha256:cd60f507c125ac0ad83f05803063bed27e50fa903b9c2cfee3f8a6867ca600fc", + "sha256:d513cc3db248e566e07a0da99c230aca3556d9b09ed02f420664e2da97eac301", + "sha256:d649dc0bcace6fcdb446ae02b98798a856593b19b637c1b9af8edadf2b150bea", + "sha256:d7008a6796095a79544f4da1ee49418901961c97ca9e9d44904205ff7d6aa8cb", + "sha256:da93027835164b8223e8e5af2cf902a4c80ed93cb0909417234f4a9df3bcd9af", + "sha256:e69215621707119c6baf99bda014a45b999d37602cb7043d943c76a59b05bf52", + "sha256:ea9525e0fef2de9208250d6c5aeeee0138921057cd67fcef90fbed49c4d62d37", + "sha256:fca1669d464f0c9831fd10be2eef6b86f5ebd76c724d1e0706ebdff86bb4adf0" + ], + "version": "==5.0.3" }, "decorator": { "hashes": [ - "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce", - "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d" + "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760", + "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==4.4.1" + "version": "==4.4.2" }, "idna": { "hashes": [ @@ -765,11 +797,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45", - "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f" + "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", + "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" ], "markers": "python_version < '3.8'", - "version": "==1.3.0" + "version": "==1.5.0" }, "ipython": { "hashes": [ @@ -791,15 +823,14 @@ "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1", "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==4.3.21" }, "jedi": { "hashes": [ - "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064", - "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807" + "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2", + "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5" ], - "version": "==0.15.2" + "version": "==0.16.0" }, "lazy-object-proxy": { "hashes": [ @@ -825,7 +856,6 @@ "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4", "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==1.4.3" }, "mccabe": { @@ -837,26 +867,24 @@ }, "more-itertools": { "hashes": [ - "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d", - "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564" + "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", + "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507" ], - "markers": "python_version > '2.7'", - "version": "==8.0.2" + "version": "==8.2.0" }, "packaging": { "hashes": [ - "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb", - "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8" + "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3", + "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==20.0" + "version": "==20.3" }, "parso": { "hashes": [ - "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1", - "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3" + "sha256:0c5659e0c6eba20636f99a04f469798dca8da279645ce5c387315b2c23912157", + "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995" ], - "version": "==0.5.2" + "version": "==0.6.2" }, "pathlib2": { "hashes": [ @@ -868,11 +896,11 @@ }, "pexpect": { "hashes": [ - "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", - "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" + "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937", + "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c" ], "markers": "sys_platform != 'win32'", - "version": "==4.7.0" + "version": "==4.8.0" }, "pickleshare": { "hashes": [ @@ -886,7 +914,6 @@ "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" ], - "markers": "python_version >= '3.5'", "version": "==0.13.1" }, "prompt-toolkit": { @@ -895,7 +922,6 @@ "sha256:dd4fca02c8069497ad931a2d09914c6b0d1b50151ce876bc15bde4c747090126", "sha256:f7eec66105baf40eda9ab026cd8b2e251337eea8d111196695d82e0c5f0af852" ], - "markers": "python_version != '3.1.*' and python_version >= '2.6' and python_version != '3.0.*'", "version": "==1.0.18" }, "ptyprocess": { @@ -910,16 +936,14 @@ "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa", "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "version": "==1.8.1" }, "pygments": { "hashes": [ - "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b", - "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe" + "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44", + "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324" ], - "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'", - "version": "==2.5.2" + "version": "==2.6.1" }, "pylint": { "hashes": [ @@ -934,16 +958,15 @@ "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f", "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec" ], - "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", "version": "==2.4.6" }, "pytest": { "hashes": [ - "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa", - "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4" + "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d", + "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6" ], "index": "ia", - "version": "==5.3.2" + "version": "==5.3.5" }, "pytest-cov": { "hashes": [ @@ -963,12 +986,11 @@ }, "pytest-pylint": { "hashes": [ - "sha256:8c38ea779e540e27ec4378b0820d906006e09f4ac834defbd886abbf57c7d2ec", - "sha256:a4f5e5007f88c2095dcac799e9f7eed3d7e7a2e657596e26093814980ff5fa20", - "sha256:a574c246535308f8f6ceac10fa82f8fffffa837071f7985b22515895185700c1" + "sha256:cac5d565182f39fbb7fa7f4ef1bbcc979e8f5cc260450ec72dc5aafeb782531f", + "sha256:dd3e232da5703e7fd14c610247dbe25dfd8e3278069b4b8bcf9778ba06b77569" ], "index": "ia", - "version": "==0.14.1" + "version": "==0.15.1" }, "pytest-pythonpath": { "hashes": [ @@ -979,19 +1001,19 @@ }, "requests": { "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], "index": "ia", - "version": "==2.22.0" + "version": "==2.23.0" }, "responses": { "hashes": [ - "sha256:515fd7c024097e5da76e9c4cf719083d181f1c3ddc09c2e0e49284ce863dd263", - "sha256:8ce8cb4e7e1ad89336f8865af152e0563d2e7f0e0b86d2cf75f015f819409243" + "sha256:0474ce3c897fbcc1aef286117c93499882d5c440f06a805947e4b1cb5ab3d474", + "sha256:f83613479a021e233e82d52ffb3e2e0e2836d24b0cc88a0fa31978789f78d0e5" ], "index": "ia", - "version": "==0.10.9" + "version": "==0.10.12" }, "simplegeneric": { "hashes": [ @@ -1001,52 +1023,50 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", - "version": "==1.13.0" + "version": "==1.14.0" }, "traitlets": { "hashes": [ "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44", "sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7" ], - "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'", "version": "==4.3.3" }, "typed-ast": { "hashes": [ - "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", - "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", - "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", - "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", - "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", - "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", - "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", - "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", - "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", - "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", - "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", - "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", - "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", - "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", - "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", - "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", - "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", - "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", - "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", - "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" ], "markers": "implementation_name == 'cpython' and python_version < '3.8'", - "version": "==1.4.0" + "version": "==1.4.1" }, "urllib3": { "hashes": [ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], - "markers": "python_version >= '3.4'", "version": "==1.22" }, "wcwidth": { @@ -1064,11 +1084,11 @@ }, "zipp": { "hashes": [ - "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", - "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1", + "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921" ], - "markers": "python_version >= '3.5'", - "version": "==0.6.0" + "index": "ia", + "version": "==1.2.0" } } } diff --git a/python/ingest_file.py b/python/ingest_file.py index d4fdcac..f6f694e 100755 --- a/python/ingest_file.py +++ b/python/ingest_file.py @@ -17,7 +17,9 @@ def run_single_ingest(args): ) if args.force_recrawl: request['force_recrawl'] = True - ingester = IngestFileWorker() + ingester = IngestFileWorker( + try_spn2=not args.no_spn2, + ) result = ingester.process(request) print(json.dumps(result, sort_keys=True)) return result @@ -51,6 +53,9 @@ def main(): sub_single.add_argument('--force-recrawl', action='store_true', help="ignore GWB history and use SPNv2 to re-crawl") + sub_single.add_argument('--no-spn2', + action='store_true', + help="don't use live web (SPNv2)") sub_single.add_argument('--type', default="pdf", help="type of ingest (pdf, html, etc)") diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 3d49096..492b558 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -1,7 +1,7 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker -from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime +from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow from .ingest import IngestFileWorker diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 7211ee0..5dc5b55 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -9,7 +9,7 @@ from collections import namedtuple from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult from sandcrawler.grobid import GrobidClient -from sandcrawler.misc import gen_file_metadata +from sandcrawler.misc import gen_file_metadata, clean_url from sandcrawler.html import extract_fulltext_url from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient @@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker): request['ingest_type'] = "pdf" assert request.get('ingest_type') == "pdf" ingest_type = request.get('ingest_type') - base_url = request['base_url'] + + # parse/clean URL + # note that we pass through the original/raw URL, and that is what gets + # persisted in database table + base_url = clean_url(request['base_url']) force_recrawl = bool(request.get('force_recrawl', False)) diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index 88669e6..d9c9d55 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -6,8 +6,15 @@ import datetime import requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error +import urlcanon +def clean_url(s): + parsed = urlcanon.parse_url(s) + if not parsed.port and parsed.colon_before_port: + parsed.colon_before_port = b'' + return str(urlcanon.whatwg(parsed)) + def gen_file_metadata(blob): """ Takes a file blob (bytestream) and returns hashes and other metadata. diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index fa46f10..03a1f29 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ This script is intended to be used for backfill ingest of old crawls. It can diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 563855d..494ec7a 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ This script is used to turn ingest request postgres rows (in JSON export diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index c51a152..2999574 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ Transform an unpaywall dump (JSON) into ingest requests. diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 420bc07..29f9e9f 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,7 +1,7 @@ import pytest -from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line +from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url def test_gen_file_metadata(): @@ -69,3 +69,9 @@ def test_invalid_cdx(): print("bad datetime") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" assert parse_cdx_line(raw) == None + +def test_clean_url(): + assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf" + assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \ + "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view" + diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index a27796b..688487f 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -37,6 +37,7 @@ CREATE TABLE IF NOT EXISTS file_meta ( size_bytes BIGINT, mimetype TEXT CHECK (octet_length(mimetype) >= 1) ); +CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex); CREATE TABLE IF NOT EXISTS fatcat_file ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md new file mode 100644 index 0000000..b46e6ec --- /dev/null +++ b/sql/monitoring_queries.md @@ -0,0 +1,94 @@ + +## fatcat-changelog pipeline + +Overall ingest status, past 3 days: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_file_result.updated >= NOW() - '3 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + +Broken domains, past 3 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '3 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Throughput per day, and success, for past month: + + SELECT ingest_request.ingest_type, + date(ingest_file_result.updated), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) + ORDER BY date(ingest_file_result.updated) DESC; + +## fatcat-ingest + +Broken domains, past 7 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Throughput per day, and success, for past 7 days: + + SELECT ingest_request.ingest_type, + date(ingest_file_result.updated), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) + ORDER BY date(ingest_file_result.updated) DESC; diff --git a/sql/random_queries.md b/sql/random_queries.md index d88f45b..572b4f9 100644 --- a/sql/random_queries.md +++ b/sql/random_queries.md @@ -117,9 +117,23 @@ Or: Can also do some quick lookups for a specific domain and protocol like: SELECT * - FROM ingest_file_result - WHERE terminal_url LIKE 'https://insights.ovid.com/%' - LIMIT 10; + FROM ingest_file_result + WHERE terminal_url LIKE 'https://insights.ovid.com/%' + LIMIT 10; + +For a given DOI prefix: + + SELECT * + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/a%' + AND status = 'no-pdf-link' + LIMIT 10; + + SELECT status, count(*) + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/%' + GROUP BY status + ORDER BY count(*) DESC; ## Bulk Ingest |