aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--notes/ingest/2020-02-14_unpaywall_ingest.md10
-rw-r--r--python/Pipfile4
-rw-r--r--python/Pipfile.lock438
-rwxr-xr-xpython/ingest_file.py7
-rwxr-xr-xpython/persist_tool.py4
-rw-r--r--python/sandcrawler/__init__.py2
-rw-r--r--python/sandcrawler/ia.py15
-rw-r--r--python/sandcrawler/ingest.py24
-rw-r--r--python/sandcrawler/misc.py7
-rw-r--r--python/sandcrawler/persist.py17
-rw-r--r--python/sandcrawler/workers.py6
-rwxr-xr-xpython/sandcrawler_worker.py4
-rwxr-xr-xpython/scripts/arabesque2ingestrequest.py2
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py2
-rwxr-xr-xpython/scripts/unpaywall2ingestrequest.py2
-rw-r--r--python/tests/test_misc.py8
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql1
-rw-r--r--sql/monitoring_queries.md94
-rw-r--r--sql/random_queries.md20
19 files changed, 435 insertions, 232 deletions
diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02-14_unpaywall_ingest.md
index 0bedfdb..24779df 100644
--- a/notes/ingest/2020-02-14_unpaywall_ingest.md
+++ b/notes/ingest/2020-02-14_unpaywall_ingest.md
@@ -474,3 +474,13 @@ Note: will probably end up re-running the below after crawling+ingesting the abo
) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json';
=> 654,885
+## Batch Ingest
+
+Test small batch:
+
+ head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full batch:
+
+ cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/python/Pipfile b/python/Pipfile
index f7e59c5..fc63697 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -34,6 +34,10 @@ python-magic = "*"
ftfy = "*"
internetarchive = "*"
Flask = ">=1"
+urlcanon = "*"
+
+# this is only to lock to a python3.5-compatible version
+zipp = "<2.0.0"
[requires]
python_version = "3.5"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 4de99d5..07e7484 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "87b092df5b5472b2b42e8d4c42e73319968c36729cc1bd3fb6de0eb3346f2a6b"
+ "sha256": "ac2202b1be9e35ebd698dd609af8d868afb3fdd0134c5581925138b51d4f475f"
},
"pipfile-spec": 6,
"requires": {
@@ -21,6 +21,22 @@
]
},
"default": {
+ "aerospike": {
+ "hashes": [
+ "sha256:3c3edb9c59491100cf5f9b0d802ee0b812b32b626c16358133cf5b9931ab8620",
+ "sha256:42e6ed4f6298aab4e5094f45a69fc805f925fbaa4ec206a87ce0a2048df02d4d",
+ "sha256:67684fb6af531765eb6061e37597bc73a348a2eff141795447ab20d9c6a61289",
+ "sha256:6aec5e0dbedb8ddd97441abaebedb04d4abbd51bfcfd6f0a6722fabc5be4efd0",
+ "sha256:9280ecb0257b0b706df7ac934dc03f518641934479d9c925a46af5231fb65f40",
+ "sha256:98779725a86ef345b9fec0b5ef60b59b2430b9c8c8e8904adb7945af6d6f9ffb",
+ "sha256:99de79a26f184a47a67123899e093cecd5c3bc0b0ce92da4f302684ad0b0116c",
+ "sha256:b170b637d69f49c02d021477359866c3d89a2c0d1477bec19343828f890d3cb1",
+ "sha256:d2f0b0288e2efafb99bbada6b39714285f317dc47fde3c4458b76e8cfbb71c11",
+ "sha256:d83faa27d40af320058a93902e19173f6295acbcc9ca225c552d7648169859f0",
+ "sha256:db7efad41300cb9bd6e70534c3110fce4e474db6d4288428609e0214a021aab8"
+ ],
+ "version": "==3.10.0"
+ },
"args": {
"hashes": [
"sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814"
@@ -51,19 +67,18 @@
},
"boto3": {
"hashes": [
- "sha256:5c00d51101d6a7ddf2207ae8a738e5c815c5fcffbee76121f38bd41d83c936a5",
- "sha256:aa58c8de6aed36211e0897598de2a3d89122ad8cd1450165679720180ab880ef"
+ "sha256:5e145fa27b56c6f3db20c20d4e0084e146a400007064af8d46c1b3bfcc779c42",
+ "sha256:95d2fb21e3f0575f8b599706100c42a5a7ae576671f6f4beea6e42ab0f57166b"
],
"index": "ia",
- "version": "==1.10.50"
+ "version": "==1.12.18"
},
"botocore": {
"hashes": [
- "sha256:765a5c637ff792239727c327b221ed5a4d851e9f176ce8b8b9eca536425c74d4",
- "sha256:adb4cb188cd0866e7337f9a049fc68db042b0340fd496d40bca349c8dbfc6a2d"
+ "sha256:2f1a54e19531bdf2d953c2db5f76b49c6936565366b2503a528b04cbbf55671a",
+ "sha256:9827c6f3bffb9e316427c276e3f22e5d82b377dd43dcbbbe71ecbd9ac2b959ab"
],
- "markers": "python_version >= '2.6' and python_version != '3.1.*' and python_version != '3.0.*'",
- "version": "==1.13.50"
+ "version": "==1.15.18"
},
"brotli": {
"hashes": [
@@ -126,11 +141,10 @@
},
"click": {
"hashes": [
- "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
- "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+ "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc",
+ "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"
],
- "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
- "version": "==7.0"
+ "version": "==7.1.1"
},
"clint": {
"hashes": [
@@ -138,6 +152,13 @@
],
"version": "==0.5.1"
},
+ "configparser": {
+ "hashes": [
+ "sha256:254c1d9c79f60c45dfde850850883d5aaa7f19a23f13561243a050d5a7c3fe4c",
+ "sha256:c7d282687a5308319bf3d2e7706e575c635b0a470342641c93bea0ea3b5331df"
+ ],
+ "version": "==4.0.2"
+ },
"confluent-kafka": {
"hashes": [
"sha256:0efd716da4f03f99d45fbb0d1583c5c8bf1eabc258a883588e3cd6ee06c0facb",
@@ -187,31 +208,28 @@
"hashes": [
"sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==0.1.4.8"
},
"dawg": {
"hashes": [
- "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5",
- "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953",
- "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e",
- "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8",
- "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3",
- "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18",
- "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55",
- "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9",
- "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171",
- "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9"
+ "sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753",
+ "sha256:34881e06278d4a54cf0b402c0c8b587bef0caa78f0eee595adc7a2aa530e48ce",
+ "sha256:73760ad1272b1b47997f1a768b8f3bf547c92475bcd62185f4ab7e1bc691964e",
+ "sha256:7aecc4c89243edaf1efe7a4d769d993a7cd9307a8a04f48e07c4fc7c44bdd38f",
+ "sha256:83ce4a73f7632b0ed31af16c2750533ecbed347bad1148a52f6436e348b5b7ac",
+ "sha256:d78929f5a7f7e083f5720992068535d133f0d3326f0c677c61c59256aa43d95e",
+ "sha256:e664a884ca48f2599ad5c2289d9b7f769e77d266560c79992e3db2cfce96cb1b",
+ "sha256:fb90b799fb7d6d728531840529c812a9ee17736da71e8a596ede8bfd6c62bf36",
+ "sha256:feb6073e0d02ac54389ad378e6c695e28fe579e2772c225a854299752effece6"
],
- "version": "==0.7.8"
+ "version": "==0.8.0"
},
"decorator": {
"hashes": [
- "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce",
- "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d"
+ "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
+ "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==4.4.1"
+ "version": "==4.4.2"
},
"docopt": {
"hashes": [
@@ -225,14 +243,12 @@
"sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827",
"sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99"
],
- "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==0.15.2"
},
"dogpile.cache": {
"hashes": [
"sha256:b348835825c9dcd251d9aad1f89f257277ac198a3e35a61980ab4cb28c75216b"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==0.9.0"
},
"elasticsearch": {
@@ -240,7 +256,6 @@
"sha256:1f0f633e3b500d5042424f75a505badf8c4b9962c1b4734cdfb3087fb67920be",
"sha256:fb5ab15ee283f104b5a7a5695c7e879cb2927e4eb5aed9c530811590b41259ad"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version < '4' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'",
"version": "==6.4.0"
},
"flask": {
@@ -253,23 +268,22 @@
},
"ftfy": {
"hashes": [
- "sha256:6d7509c45e602dec890f0f6ee0623a8b5f50ec1188ac7ab9535e18e572c99bcc"
+ "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8"
],
"index": "ia",
- "version": "==5.6"
+ "version": "==5.7"
},
"globalwayback": {
"hashes": [
- "sha256:ddd1fac7caad4181e8e623cb67ef3f6a6f7c0f306140c450b92f8bb3032aba51"
+ "sha256:6f7bd270ef827c1d8c1d5631c313dbf401e300993b593179699741fcdbc5295d"
],
"index": "ia",
- "version": "==0.4.13"
+ "version": "==0.5.2"
},
"ialib": {
"hashes": [
"sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc"
],
- "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
"version": "==0.3.0.1"
},
"idna": {
@@ -292,38 +306,34 @@
"sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
"sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
],
- "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
"version": "==1.1.0"
},
"jinja2": {
"hashes": [
- "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f",
- "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de"
+ "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250",
+ "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==2.10.3"
+ "version": "==2.11.1"
},
"jmespath": {
"hashes": [
- "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6",
- "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c"
+ "sha256:695cb76fa78a10663425d5b73ddc5714eb711157e52704d69be03b1a02ba4fec",
+ "sha256:cca55c8d153173e21baa59983015ad0daf603f9cb799904ff057bfb8ff8dc2d9"
],
- "version": "==0.9.4"
+ "version": "==0.9.5"
},
"jsonpatch": {
"hashes": [
- "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6",
- "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a"
+ "sha256:cc3a7241010a1fd3f50145a3b33be2c03c1e679faa19934b628bb07d0f64819e",
+ "sha256:ddc0f7628b8bfdd62e3cbfbc24ca6671b0b6265b50d186c2cf3659dc0f78fd6a"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==1.24"
+ "version": "==1.25"
},
"jsonpointer": {
"hashes": [
"sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362",
"sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
"version": "==2.0"
},
"kazoo": {
@@ -331,7 +341,6 @@
"sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4",
"sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e"
],
- "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
"version": "==2.5.0"
},
"markupsafe": {
@@ -340,13 +349,16 @@
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
"sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
"sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
+ "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
"sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
"sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
"sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
"sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
"sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
+ "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
"sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
+ "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
"sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
@@ -363,42 +375,68 @@
"sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
"sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
"sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
- "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"
+ "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
+ "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
+ "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
"version": "==1.1.1"
},
"minio": {
"hashes": [
- "sha256:daef713dfaaa232a719c4dc58f0bed90b6d11ab14e94cd013cef7e7ed5b2cfdc",
- "sha256:e5dc4670fe5c3e0ef61e9556bbae4768fcb051f3deb73d89e40a1b470dc9bd88"
+ "sha256:7543f990231b0d605f35b9140ec51cdb6335e741bb8c45dea9b8746c248a54bf",
+ "sha256:bf9b5001273108864e09a330a0a1795c6caee6036a124a607916830544afbf5f",
+ "sha256:f1987811525120d68a420fc6142ca310df174123da56742233824342e3a8c3f8"
],
"index": "ia",
- "version": "==5.0.6"
+ "version": "==5.0.7"
},
"pillow": {
"hashes": [
+ "sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8",
"sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b",
+ "sha256:1937c9e17f685fe6c360dd96ddb8f93f159ac721939ccbfc91a62d8124a29945",
+ "sha256:1ce73edaeb49af9ebeacfb8c58428ae39592839d3a7a16ef3926773f1c8ff8ee",
+ "sha256:2052f0372123c98497ee3294f4e20347d87b9f70d9c65ce2fc520b9339aa8465",
"sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe",
+ "sha256:26b4a2bcdf0e674505fcd2f1a882f29a99339ac3b5a8e7997b90ede2995434e2",
+ "sha256:282e069f92e43047b34bbd995a8800669af11d038db571758708ebcd96462964",
+ "sha256:2ee30463cbc8e60cba92722a3a64881a0d3df534a047e299a9bcf62ea34fd061",
"sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b",
"sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608",
"sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac",
"sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389",
+ "sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c",
"sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff",
+ "sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8",
"sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f",
+ "sha256:5bcea0df97fe0b911a6629aab0997b98e8811561c27167266758a7ede173123e",
"sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b",
+ "sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f",
+ "sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51",
+ "sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3",
+ "sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2",
"sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3",
"sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05",
+ "sha256:82351254350d9212bccf71d387efae8ad8f6f4b904d095546a77852a6b16e05b",
+ "sha256:95236f64904157256254b6cc8e29feecd9ee6985732dcb36c9f58d7dabe081d0",
"sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64",
"sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7",
"sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842",
+ "sha256:9e274583a0eab0b6d227139146e28f74488cfbc0d262c4ba2e5c0998b9c498d2",
"sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a",
+ "sha256:a10befeb7b9975d7c3d2ca3eaf0cb505db98fe50874130e182c2a6f7a606591f",
"sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3",
+ "sha256:b55f62882d8db466fcf2228422bf3147617744888bf0cf6dffb3254a52eb316b",
+ "sha256:bf83901c158ad92e77e990f51531434e5a96c6aef805a84b6e3bfe825f4d4d0c",
+ "sha256:c32f99a0c7c5313b2df78399ef908563b319de23bec0cc89f1d04c37be19eb2d",
"sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984",
+ "sha256:cf4b3c634b317ee247c3add4375b0a6bdc45eb0c12a5d7fbf9bfd47ec10b020f",
"sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3",
"sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c",
+ "sha256:e04df3808d6202dd552c837c824796899c09ff0ff9c335607904e31f9d387110",
"sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb",
"sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0",
+ "sha256:eae3711a7916eb5ec800dfb6963da09db0ada63c0481639dd0ddc0b505883a02",
"sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c",
"sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394"
],
@@ -431,6 +469,7 @@
},
"pykafka": {
"hashes": [
+ "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459",
"sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577"
],
"index": "ia",
@@ -454,7 +493,6 @@
"sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
"sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
],
- "markers": "python_version >= '2.7'",
"version": "==2.8.1"
},
"python-magic": {
@@ -497,7 +535,6 @@
"sha256:e3a057b7a64f1222b56e47bcff5e4b94c4f61faac04c7c4ecb1985e18caa3994",
"sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615"
],
- "markers": "python_version != '3.2.*' and python_version != '3.4.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*'",
"version": "==5.3"
},
"raven": {
@@ -506,24 +543,22 @@
"sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
],
"index": "ia",
- "markers": null,
"version": "==6.10.0"
},
"redis": {
"hashes": [
- "sha256:3613daad9ce5951e426f460deddd5caf469e08a3af633e9578fc77d362becf62",
- "sha256:8d0fc278d3f5e1249967cba2eb4a5632d19e45ce5c09442b8422d15ee2c22cc2"
+ "sha256:0dcfb335921b88a850d461dc255ff4708294943322bd55de6cfd68972490ca1f",
+ "sha256:b205cffd05ebfd0a468db74f0eedbff8df1a7bfc47521516ade4692991bb0833"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==3.3.11"
+ "version": "==3.4.1"
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
+ "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
],
"index": "ia",
- "version": "==2.22.0"
+ "version": "==2.23.0"
},
"requests-file": {
"hashes": [
@@ -540,11 +575,10 @@
},
"s3transfer": {
"hashes": [
- "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d",
- "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba"
+ "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13",
+ "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
- "version": "==0.2.1"
+ "version": "==0.3.3"
},
"schedule": {
"hashes": [
@@ -562,25 +596,23 @@
},
"six": {
"hashes": [
- "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd",
- "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"
+ "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
+ "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==1.13.0"
+ "version": "==1.14.0"
},
"soupsieve": {
"hashes": [
- "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5",
- "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda"
+ "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae",
+ "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69"
],
- "version": "==1.9.5"
+ "version": "==2.0"
},
"sqlalchemy": {
"hashes": [
- "sha256:bfb8f464a5000b567ac1d350b9090cf081180ec1ab4aa87e7bca12dab25320ec"
+ "sha256:b92d2de62e43499d85b1780274d1b562e5159c7996f6f04a9bb46cf681ced45f"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==1.3.12"
+ "version": "==1.3.14"
},
"surt": {
"hashes": [
@@ -600,7 +632,6 @@
"sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7",
"sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808"
],
- "markers": "python_version != '3.1.*' and python_version != '3.0.*' and python_version >= '2.6'",
"version": "==2.2.2"
},
"twitter": {
@@ -610,12 +641,18 @@
],
"version": "==1.18.0"
},
+ "urlcanon": {
+ "hashes": [
+ "sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62"
+ ],
+ "index": "ia",
+ "version": "==0.3.1"
+ },
"urllib3": {
"hashes": [
"sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
"sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
],
- "markers": "python_version >= '3.4'",
"version": "==1.22"
},
"warctools": {
@@ -626,10 +663,10 @@
},
"wayback": {
"hashes": [
- "sha256:3e89df1a3cb49baffe03572a77d00d97d54ccebeb4dd24f19d8f2b8ec3812ad3"
+ "sha256:936ae4c75af922e0f4b1bc82c66f51f97687cc6a1b8f3f1a19ec8fa7ab11ec41"
],
"index": "ia",
- "version": "==0.5.1"
+ "version": "==0.5.3"
},
"wayback-esp": {
"hashes": [
@@ -647,7 +684,6 @@
"hashes": [
"sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
],
- "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
"version": "==0.1.5"
},
"wcwidth": {
@@ -659,11 +695,18 @@
},
"werkzeug": {
"hashes": [
- "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7",
- "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4"
+ "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096",
+ "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*'",
- "version": "==0.16.0"
+ "version": "==1.0.0"
+ },
+ "zipp": {
+ "hashes": [
+ "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1",
+ "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"
+ ],
+ "index": "ia",
+ "version": "==1.2.0"
}
},
"develop": {
@@ -674,20 +717,11 @@
],
"version": "==2.3.3"
},
- "atomicwrites": {
- "hashes": [
- "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
- "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
- ],
- "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
- "version": "==1.3.0"
- },
"attrs": {
"hashes": [
"sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
"sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==19.3.0"
},
"backcall": {
@@ -713,48 +747,46 @@
},
"coverage": {
"hashes": [
- "sha256:189aac76d6e0d7af15572c51892e7326ee451c076c5a50a9d266406cd6c49708",
- "sha256:1bf7ba2af1d373a1750888724f84cffdfc697738f29a353c98195f98fc011509",
- "sha256:1f4ee8e2e4243971618bc16fcc4478317405205f135e95226c2496e2a3b8dbbf",
- "sha256:225e79a5d485bc1642cb7ba02281419c633c216cdc6b26c26494ba959f09e69f",
- "sha256:23688ff75adfa8bfa2a67254d889f9bdf9302c27241d746e17547c42c732d3f4",
- "sha256:28f7f73b34a05e23758e860a89a7f649b85c6749e252eff60ebb05532d180e86",
- "sha256:2d0cb9b1fe6ad0d915d45ad3d87f03a38e979093a98597e755930db1f897afae",
- "sha256:47874b4711c5aeb295c31b228a758ce3d096be83dc37bd56da48ed99efb8813b",
- "sha256:511ec0c00840e12fb4e852e4db58fa6a01ca4da72f36a9766fae344c3d502033",
- "sha256:53e7438fef0c97bc248f88ba1edd10268cd94d5609970aaf87abbe493691af87",
- "sha256:569f9ee3025682afda6e9b0f5bb14897c0db03f1a1dc088b083dd36e743f92bb",
- "sha256:593853aa1ac6dcc6405324d877544c596c9d948ef20d2e9512a0f5d2d3202356",
- "sha256:5b0a07158360d22492f9abd02a0f2ee7981b33f0646bf796598b7673f6bbab14",
- "sha256:7ca3db38a61f3655a2613ee2c190d63639215a7a736d3c64cc7bbdb002ce6310",
- "sha256:7d1cc7acc9ce55179616cf72154f9e648136ea55987edf84addbcd9886ffeba2",
- "sha256:88b51153657612aea68fa684a5b88037597925260392b7bb4509d4f9b0bdd889",
- "sha256:955ec084f549128fa2702f0b2dc696392001d986b71acd8fd47424f28289a9c3",
- "sha256:b251c7092cbb6d789d62dc9c9e7c4fb448c9138b51285c36aeb72462cad3600e",
- "sha256:bd82b684bb498c60ef47bb1541a50e6d006dde8579934dcbdbc61d67d1ea70d9",
- "sha256:bfe102659e2ec13b86c7f3b1db6c9a4e7beea4255058d006351339e6b342d5d2",
- "sha256:c1e4e39e43057396a5e9d069bfbb6ffeee892e40c5d2effbd8cd71f34ee66c4d",
- "sha256:cb2b74c123f65e8166f7e1265829a6c8ed755c3cd16d7f50e75a83456a5f3fd7",
- "sha256:cca38ded59105f7705ef6ffe1e960b8db6c7d8279c1e71654a4775ab4454ca15",
- "sha256:cf908840896f7aa62d0ec693beb53264b154f972eb8226fb864ac38975590c4f",
- "sha256:d095a7b473f8a95f7efe821f92058c8a2ecfb18f8db6677ae3819e15dc11aaae",
- "sha256:d22b4297e7e4225ccf01f1aa55e7a96412ea0796b532dd614c3fcbafa341128e",
- "sha256:d4a2b578a7a70e0c71f662705262f87a456f1e6c1e40ada7ea699abaf070a76d",
- "sha256:ddeb42a3d5419434742bf4cc71c9eaa22df3b76808e23a82bd0b0bd360f1a9f1",
- "sha256:e65a5aa1670db6263f19fdc03daee1d7dbbadb5cb67fd0a1f16033659db13c1d",
- "sha256:eaad65bd20955131bcdb3967a4dea66b4e4d4ca488efed7c00d91ee0173387e8",
- "sha256:f45fba420b94165c17896861bb0e8b27fb7abdcedfeb154895d8553df90b7b00"
- ],
- "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version < '4' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'",
- "version": "==5.0.2"
+ "sha256:15cf13a6896048d6d947bf7d222f36e4809ab926894beb748fc9caa14605d9c3",
+ "sha256:1daa3eceed220f9fdb80d5ff950dd95112cd27f70d004c7918ca6dfc6c47054c",
+ "sha256:1e44a022500d944d42f94df76727ba3fc0a5c0b672c358b61067abb88caee7a0",
+ "sha256:25dbf1110d70bab68a74b4b9d74f30e99b177cde3388e07cc7272f2168bd1477",
+ "sha256:3230d1003eec018ad4a472d254991e34241e0bbd513e97a29727c7c2f637bd2a",
+ "sha256:3dbb72eaeea5763676a1a1efd9b427a048c97c39ed92e13336e726117d0b72bf",
+ "sha256:5012d3b8d5a500834783689a5d2292fe06ec75dc86ee1ccdad04b6f5bf231691",
+ "sha256:51bc7710b13a2ae0c726f69756cf7ffd4362f4ac36546e243136187cfcc8aa73",
+ "sha256:527b4f316e6bf7755082a783726da20671a0cc388b786a64417780b90565b987",
+ "sha256:722e4557c8039aad9592c6a4213db75da08c2cd9945320220634f637251c3894",
+ "sha256:76e2057e8ffba5472fd28a3a010431fd9e928885ff480cb278877c6e9943cc2e",
+ "sha256:77afca04240c40450c331fa796b3eab6f1e15c5ecf8bf2b8bee9706cd5452fef",
+ "sha256:7afad9835e7a651d3551eab18cbc0fdb888f0a6136169fbef0662d9cdc9987cf",
+ "sha256:9bea19ac2f08672636350f203db89382121c9c2ade85d945953ef3c8cf9d2a68",
+ "sha256:a8b8ac7876bc3598e43e2603f772d2353d9931709345ad6c1149009fd1bc81b8",
+ "sha256:b0840b45187699affd4c6588286d429cd79a99d509fe3de0f209594669bb0954",
+ "sha256:b26aaf69713e5674efbde4d728fb7124e429c9466aeaf5f4a7e9e699b12c9fe2",
+ "sha256:b63dd43f455ba878e5e9f80ba4f748c0a2156dde6e0e6e690310e24d6e8caf40",
+ "sha256:be18f4ae5a9e46edae3f329de2191747966a34a3d93046dbdf897319923923bc",
+ "sha256:c312e57847db2526bc92b9bfa78266bfbaabac3fdcd751df4d062cd4c23e46dc",
+ "sha256:c60097190fe9dc2b329a0eb03393e2e0829156a589bd732e70794c0dd804258e",
+ "sha256:c62a2143e1313944bf4a5ab34fd3b4be15367a02e9478b0ce800cb510e3bbb9d",
+ "sha256:cc1109f54a14d940b8512ee9f1c3975c181bbb200306c6d8b87d93376538782f",
+ "sha256:cd60f507c125ac0ad83f05803063bed27e50fa903b9c2cfee3f8a6867ca600fc",
+ "sha256:d513cc3db248e566e07a0da99c230aca3556d9b09ed02f420664e2da97eac301",
+ "sha256:d649dc0bcace6fcdb446ae02b98798a856593b19b637c1b9af8edadf2b150bea",
+ "sha256:d7008a6796095a79544f4da1ee49418901961c97ca9e9d44904205ff7d6aa8cb",
+ "sha256:da93027835164b8223e8e5af2cf902a4c80ed93cb0909417234f4a9df3bcd9af",
+ "sha256:e69215621707119c6baf99bda014a45b999d37602cb7043d943c76a59b05bf52",
+ "sha256:ea9525e0fef2de9208250d6c5aeeee0138921057cd67fcef90fbed49c4d62d37",
+ "sha256:fca1669d464f0c9831fd10be2eef6b86f5ebd76c724d1e0706ebdff86bb4adf0"
+ ],
+ "version": "==5.0.3"
},
"decorator": {
"hashes": [
- "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce",
- "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d"
+ "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
+ "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==4.4.1"
+ "version": "==4.4.2"
},
"idna": {
"hashes": [
@@ -765,11 +797,11 @@
},
"importlib-metadata": {
"hashes": [
- "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45",
- "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"
+ "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302",
+ "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b"
],
"markers": "python_version < '3.8'",
- "version": "==1.3.0"
+ "version": "==1.5.0"
},
"ipython": {
"hashes": [
@@ -791,15 +823,14 @@
"sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1",
"sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==4.3.21"
},
"jedi": {
"hashes": [
- "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064",
- "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807"
+ "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2",
+ "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5"
],
- "version": "==0.15.2"
+ "version": "==0.16.0"
},
"lazy-object-proxy": {
"hashes": [
@@ -825,7 +856,6 @@
"sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4",
"sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==1.4.3"
},
"mccabe": {
@@ -837,26 +867,24 @@
},
"more-itertools": {
"hashes": [
- "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d",
- "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564"
+ "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c",
+ "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507"
],
- "markers": "python_version > '2.7'",
- "version": "==8.0.2"
+ "version": "==8.2.0"
},
"packaging": {
"hashes": [
- "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb",
- "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8"
+ "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3",
+ "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==20.0"
+ "version": "==20.3"
},
"parso": {
"hashes": [
- "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1",
- "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3"
+ "sha256:0c5659e0c6eba20636f99a04f469798dca8da279645ce5c387315b2c23912157",
+ "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995"
],
- "version": "==0.5.2"
+ "version": "==0.6.2"
},
"pathlib2": {
"hashes": [
@@ -868,11 +896,11 @@
},
"pexpect": {
"hashes": [
- "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
- "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+ "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937",
+ "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"
],
"markers": "sys_platform != 'win32'",
- "version": "==4.7.0"
+ "version": "==4.8.0"
},
"pickleshare": {
"hashes": [
@@ -886,7 +914,6 @@
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
],
- "markers": "python_version >= '3.5'",
"version": "==0.13.1"
},
"prompt-toolkit": {
@@ -895,7 +922,6 @@
"sha256:dd4fca02c8069497ad931a2d09914c6b0d1b50151ce876bc15bde4c747090126",
"sha256:f7eec66105baf40eda9ab026cd8b2e251337eea8d111196695d82e0c5f0af852"
],
- "markers": "python_version != '3.1.*' and python_version >= '2.6' and python_version != '3.0.*'",
"version": "==1.0.18"
},
"ptyprocess": {
@@ -910,16 +936,14 @@
"sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa",
"sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"version": "==1.8.1"
},
"pygments": {
"hashes": [
- "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b",
- "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe"
+ "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44",
+ "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"
],
- "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.4.*' and python_version >= '2.7'",
- "version": "==2.5.2"
+ "version": "==2.6.1"
},
"pylint": {
"hashes": [
@@ -934,16 +958,15 @@
"sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f",
"sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"
],
- "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
"version": "==2.4.6"
},
"pytest": {
"hashes": [
- "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa",
- "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4"
+ "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d",
+ "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"
],
"index": "ia",
- "version": "==5.3.2"
+ "version": "==5.3.5"
},
"pytest-cov": {
"hashes": [
@@ -963,12 +986,11 @@
},
"pytest-pylint": {
"hashes": [
- "sha256:8c38ea779e540e27ec4378b0820d906006e09f4ac834defbd886abbf57c7d2ec",
- "sha256:a4f5e5007f88c2095dcac799e9f7eed3d7e7a2e657596e26093814980ff5fa20",
- "sha256:a574c246535308f8f6ceac10fa82f8fffffa837071f7985b22515895185700c1"
+ "sha256:cac5d565182f39fbb7fa7f4ef1bbcc979e8f5cc260450ec72dc5aafeb782531f",
+ "sha256:dd3e232da5703e7fd14c610247dbe25dfd8e3278069b4b8bcf9778ba06b77569"
],
"index": "ia",
- "version": "==0.14.1"
+ "version": "==0.15.1"
},
"pytest-pythonpath": {
"hashes": [
@@ -979,19 +1001,19 @@
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
+ "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
],
"index": "ia",
- "version": "==2.22.0"
+ "version": "==2.23.0"
},
"responses": {
"hashes": [
- "sha256:515fd7c024097e5da76e9c4cf719083d181f1c3ddc09c2e0e49284ce863dd263",
- "sha256:8ce8cb4e7e1ad89336f8865af152e0563d2e7f0e0b86d2cf75f015f819409243"
+ "sha256:0474ce3c897fbcc1aef286117c93499882d5c440f06a805947e4b1cb5ab3d474",
+ "sha256:f83613479a021e233e82d52ffb3e2e0e2836d24b0cc88a0fa31978789f78d0e5"
],
"index": "ia",
- "version": "==0.10.9"
+ "version": "==0.10.12"
},
"simplegeneric": {
"hashes": [
@@ -1001,52 +1023,50 @@
},
"six": {
"hashes": [
- "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd",
- "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"
+ "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
+ "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
- "version": "==1.13.0"
+ "version": "==1.14.0"
},
"traitlets": {
"hashes": [
"sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44",
"sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7"
],
- "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
"version": "==4.3.3"
},
"typed-ast": {
"hashes": [
- "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161",
- "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e",
- "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e",
- "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0",
- "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c",
- "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47",
- "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631",
- "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4",
- "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34",
- "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b",
- "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2",
- "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e",
- "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a",
- "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233",
- "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1",
- "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36",
- "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d",
- "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a",
- "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66",
- "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12"
+ "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
+ "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
+ "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
+ "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
+ "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
+ "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
+ "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
+ "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
+ "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
+ "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
+ "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
+ "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
+ "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
+ "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
+ "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
+ "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
+ "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
+ "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
+ "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
+ "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
+ "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
],
"markers": "implementation_name == 'cpython' and python_version < '3.8'",
- "version": "==1.4.0"
+ "version": "==1.4.1"
},
"urllib3": {
"hashes": [
"sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
"sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
],
- "markers": "python_version >= '3.4'",
"version": "==1.22"
},
"wcwidth": {
@@ -1064,11 +1084,11 @@
},
"zipp": {
"hashes": [
- "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e",
- "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335"
+ "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1",
+ "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"
],
- "markers": "python_version >= '3.5'",
- "version": "==0.6.0"
+ "index": "ia",
+ "version": "==1.2.0"
}
}
}
diff --git a/python/ingest_file.py b/python/ingest_file.py
index d4fdcac..f6f694e 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -17,7 +17,9 @@ def run_single_ingest(args):
)
if args.force_recrawl:
request['force_recrawl'] = True
- ingester = IngestFileWorker()
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ )
result = ingester.process(request)
print(json.dumps(result, sort_keys=True))
return result
@@ -51,6 +53,9 @@ def main():
sub_single.add_argument('--force-recrawl',
action='store_true',
help="ignore GWB history and use SPNv2 to re-crawl")
+ sub_single.add_argument('--no-spn2',
+ action='store_true',
+ help="don't use live web (SPNv2)")
sub_single.add_argument('--type',
default="pdf",
help="type of ingest (pdf, html, etc)")
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 19e6dd7..869af06 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -41,6 +41,7 @@ def run_grobid(args):
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
s3_only=args.s3_only,
+ db_only=args.db_only,
)
pusher = JsonLinePusher(
worker,
@@ -135,6 +136,9 @@ def main():
sub_grobid.add_argument('--s3-only',
action='store_true',
help="only upload TEI-XML to S3 (don't write to database)")
+ sub_grobid.add_argument('--db-only',
+ action='store_true',
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
sub_grobid_disk = subparsers.add_parser('grobid-disk',
help="dump GRBOID output to (local) files on disk")
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 3d49096..492b558 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 99e92be..25697be 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -575,6 +575,7 @@ class WaybackClient:
next_url = start_url
urls_seen = [start_url]
for i in range(self.max_redirects):
+ print(" URL: {}".format(next_url), file=sys.stderr)
cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype)
#print(cdx_row, file=sys.stderr)
if not cdx_row:
@@ -659,7 +660,12 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
- next_url = resource.location
+ if resource.location.startswith('/'):
+ # redirect location does not include hostname
+ domain_prefix = '/'.join(next_url.split('/')[:3])
+ next_url = domain_prefix + resource.location
+ else:
+ next_url = resource.location
else:
next_url = self.fetch_replay_redirect(
url=cdx_row.url,
@@ -753,7 +759,7 @@ class SavePageNowClient:
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url):
+ def save_url_now_v2(self, request_url, force_get=0):
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -792,6 +798,7 @@ class SavePageNowClient:
'capture_all': 1,
'capture_screenshot': 0,
'if_not_archived_within': '1d',
+ 'force_get': force_get,
},
)
if resp.status_code == 429:
@@ -861,14 +868,14 @@ class SavePageNowClient:
None,
)
- def crawl_resource(self, start_url, wayback_client):
+ def crawl_resource(self, start_url, wayback_client, force_get=0):
"""
Runs a SPN2 crawl, then fetches body from wayback.
TODO: possible to fetch from petabox?
"""
- spn_result = self.save_url_now_v2(start_url)
+ spn_result = self.save_url_now_v2(start_url, force_get=force_get)
if not spn_result.success:
status = spn_result.status
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7211ee0..c9a697c 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,7 +9,7 @@ from collections import namedtuple
from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.misc import gen_file_metadata, clean_url
from sandcrawler.html import extract_fulltext_url
from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
@@ -93,6 +93,15 @@ class IngestFileWorker(SandcrawlerWorker):
"digital.ucd.ie/", # ireland national historical
]
+ # these are special-case web domains for which we want SPN2 to not run
+ # a headless browser (brozzler), but instead simply run wget.
+ # the motivation could be to work around browser issues, or in the
+ # future possibly to increase download efficiency (wget/fetch being
+ # faster than browser fetch)
+ self.spn2_simple_get_domains = [
+ ]
+
+
def check_existing_ingest(self, base_url):
"""
Check in sandcrawler-db (postgres) to see if we have already ingested
@@ -138,7 +147,12 @@ class IngestFileWorker(SandcrawlerWorker):
if self.try_spn2 and (not resource or not resource.hit or soft404):
via = "spn2"
- resource = self.spn_client.crawl_resource(url, self.wayback_client)
+ force_get = 0
+ for domain in self.spn2_simple_get_domains:
+ if domain in url:
+ force_get = 1
+ break
+ resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get)
print("[FETCH {}\t] {}\t{}".format(
via,
resource.status,
@@ -224,7 +238,11 @@ class IngestFileWorker(SandcrawlerWorker):
request['ingest_type'] = "pdf"
assert request.get('ingest_type') == "pdf"
ingest_type = request.get('ingest_type')
- base_url = request['base_url']
+
+ # parse/clean URL
+ # note that the request passes through with the original/raw URL (which is
+ # what gets persisted in the database table); only the local copy is cleaned
+ base_url = clean_url(request['base_url'])
force_recrawl = bool(request.get('force_recrawl', False))
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 88669e6..d9c9d55 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -6,8 +6,15 @@ import datetime
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
+def clean_url(s):
+ parsed = urlcanon.parse_url(s)
+ if not parsed.port and parsed.colon_before_port:
+ parsed.colon_before_port = b''
+ return str(urlcanon.whatwg(parsed))
+
def gen_file_metadata(blob):
"""
Takes a file blob (bytestream) and returns hashes and other metadata.
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index f5de44a..379fd8b 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -240,6 +240,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
default_bucket=kwargs['s3_bucket'],
)
self.s3_only = kwargs.get('s3_only', False)
+ self.db_only = kwargs.get('db_only', False)
+ assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
def process(self, record):
"""
@@ -264,13 +266,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
continue
assert len(r['key']) == 40
- resp = self.s3.put_blob(
- folder="grobid",
- blob=r['tei_xml'],
- sha1hex=r['key'],
- extension=".tei.xml",
- )
- self.counts['s3-put'] += 1
+ if not self.db_only:
+ resp = self.s3.put_blob(
+ folder="grobid",
+ blob=r['tei_xml'],
+ sha1hex=r['key'],
+ extension=".tei.xml",
+ )
+ self.counts['s3-put'] += 1
# enhance with teixml2json metadata, if available
try:
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index c290421..54bd581 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -52,6 +52,12 @@ class SandcrawlerWorker(object):
print("Worker: {}".format(self.counts), file=sys.stderr)
return self.counts
+ def process(self, task):
+ """
+ Derived workers need to implement business logic here.
+ """
+ raise NotImplementedError('implementation required')
+
class MultiprocessWrapper(SandcrawlerWorker):
def __init__(self, worker, sink, jobs=None):
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 0ba4d03..5720f48 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -58,6 +58,7 @@ def run_persist_grobid(args):
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
s3_only=args.s3_only,
+ db_only=args.db_only,
)
pusher = KafkaJsonPusher(
worker=worker,
@@ -173,6 +174,9 @@ def main():
sub_persist_grobid.add_argument('--s3-only',
action='store_true',
help="only upload TEI-XML to S3 (don't write to database)")
+ sub_persist_grobid.add_argument('--db-only',
+ action='store_true',
+ help="only write status to database (don't upload TEI-XML to S3)")
sub_persist_grobid.set_defaults(func=run_persist_grobid)
sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index fa46f10..03a1f29 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
"""
This script is intended to be used for backfill ingest of old crawls. It can
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 563855d..494ec7a 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
"""
This script is used to turn ingest request postgres rows (in JSON export
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index c51a152..2999574 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
"""
Transform an unpaywall dump (JSON) into ingest requests.
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 420bc07..29f9e9f 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
def test_gen_file_metadata():
@@ -69,3 +69,9 @@ def test_invalid_cdx():
print("bad datetime")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
assert parse_cdx_line(raw) == None
+
+def test_clean_url():
+ assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
+ assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
+ "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index a27796b..688487f 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -37,6 +37,7 @@ CREATE TABLE IF NOT EXISTS file_meta (
size_bytes BIGINT,
mimetype TEXT CHECK (octet_length(mimetype) >= 1)
);
+CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex);
CREATE TABLE IF NOT EXISTS fatcat_file (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md
new file mode 100644
index 0000000..35eef8f
--- /dev/null
+++ b/sql/monitoring_queries.md
@@ -0,0 +1,94 @@
+
+## fatcat-changelog pipeline
+
+Overall ingest status, past 3 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '3 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+Broken domains, past 3 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '3 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Throughput per day, and success, for past month:
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_file_result.updated),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated)
+ ORDER BY date(ingest_file_result.updated) DESC;
+
+## fatcat-ingest
+
+Broken domains, past 7 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Throughput per day, and success, for past 7 days:
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_file_result.updated),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated)
+ ORDER BY date(ingest_file_result.updated) DESC;
diff --git a/sql/random_queries.md b/sql/random_queries.md
index d88f45b..572b4f9 100644
--- a/sql/random_queries.md
+++ b/sql/random_queries.md
@@ -117,9 +117,23 @@ Or:
Can also do some quick lookups for a specific domain and protocol like:
SELECT *
- FROM ingest_file_result
- WHERE terminal_url LIKE 'https://insights.ovid.com/%'
- LIMIT 10;
+ FROM ingest_file_result
+ WHERE terminal_url LIKE 'https://insights.ovid.com/%'
+ LIMIT 10;
+
+For a given DOI prefix:
+
+ SELECT *
+ FROM ingest_file_result
+ WHERE base_url LIKE 'https://doi.org/10.17223/a%'
+ AND status = 'no-pdf-link'
+ LIMIT 10;
+
+ SELECT status, count(*)
+ FROM ingest_file_result
+ WHERE base_url LIKE 'https://doi.org/10.17223/%'
+ GROUP BY status
+ ORDER BY count(*) DESC;
## Bulk Ingest