From e590eec544ab6f2e54e8770f01e64eef3158fdaa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 19 Nov 2018 23:04:18 -0800 Subject: initial OAI-PMH harvesters --- python/Pipfile | 1 + python/Pipfile.lock | 320 ++++++++++++++++---------- python/fatcat_harvest.py | 57 ++++- python/fatcat_tools/harvest/__init__.py | 2 + python/fatcat_tools/harvest/doi_registrars.py | 13 +- python/fatcat_tools/harvest/oaipmh.py | 157 +++++++++++++ 6 files changed, 417 insertions(+), 133 deletions(-) create mode 100644 python/fatcat_tools/harvest/oaipmh.py diff --git a/python/Pipfile b/python/Pipfile index f4137dca..04b5c5ef 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -31,6 +31,7 @@ flask-uuid = "*" flask-debugtoolbar = "*" pykafka = "*" python-dateutil = "*" +sickle = "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/Pipfile.lock b/python/Pipfile.lock index da96a24f..fa597faa 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "c99945057fc87c7a825a76cfe4d1abdcb99d0e70dc71db770cf259b761c6835c" + "sha256": "f3c3ce52dc614cb4514b4d637732e705815403f9181c142aea59a7feaa51f355" }, "pipfile-spec": 6, "requires": { @@ -110,11 +110,73 @@ ], "version": "==2.5.0" }, + "lxml": { + "hashes": [ + "sha256:02bc220d61f46e9b9d5a53c361ef95e9f5e1d27171cd461dddb17677ae2289a5", + "sha256:22f253b542a342755f6cfc047fe4d3a296515cf9b542bc6e261af45a80b8caf6", + "sha256:2f31145c7ff665b330919bfa44aacd3a0211a76ca7e7b441039d2a0b0451e415", + "sha256:36720698c29e7a9626a0dc802ef8885f8f0239bfd1689628ecd459a061f2807f", + "sha256:438a1b0203545521f6616132bfe0f4bca86f8a401364008b30e2b26ec408ce85", + "sha256:4815892904c336bbaf73dafd54f45f69f4021c22b5bad7332176bbf4fb830568", + "sha256:5be031b0f15ad63910d8e5038b489d95a79929513b3634ad4babf77100602588", + "sha256:5c93ae37c3c588e829b037fdfbd64a6e40c901d3f93f7beed6d724c44829a3ad", + "sha256:60842230678674cdac4a1cf0f707ef12d75b9a4fc4a565add4f710b5fcf185d5", 
+ "sha256:62939a8bb6758d1bf923aa1c13f0bcfa9bf5b2fc0f5fa917a6e25db5fe0cfa4e", + "sha256:75830c06a62fe7b8fe3bbb5f269f0b308f19f3949ac81cfd40062f47c1455faf", + "sha256:81992565b74332c7c1aff6a913a3e906771aa81c9d0c68c68113cffcae45bc53", + "sha256:8c892fb0ee52c594d9a7751c7d7356056a9682674b92cc1c4dc968ff0f30c52f", + "sha256:9d862e3cf4fc1f2837dedce9c42269c8c76d027e49820a548ac89fdcee1e361f", + "sha256:a623965c086a6e91bb703d4da62dabe59fe88888e82c4117d544e11fd74835d6", + "sha256:a7783ab7f6a508b0510490cef9f857b763d796ba7476d9703f89722928d1e113", + "sha256:aab09fbe8abfa3b9ce62aaf45aca2d28726b1b9ee44871dbe644050a2fff4940", + "sha256:abf181934ac3ef193832fb973fd7f6149b5c531903c2ec0f1220941d73eee601", + "sha256:ae07fa0c115733fce1e9da96a3ac3fa24801742ca17e917e0c79d63a01eeb843", + "sha256:b9c78242219f674ab645ec571c9a95d70f381319a23911941cd2358a8e0521cf", + "sha256:bccb267678b870d9782c3b44d0cefe3ba0e329f9af8c946d32bf3778e7a4f271", + "sha256:c4df4d27f4c93b2cef74579f00b1d3a31a929c7d8023f870c4b476f03a274db4", + "sha256:caf0e50b546bb60dfa99bb18dfa6748458a83131ecdceaf5c071d74907e7e78a", + "sha256:d3266bd3ac59ac4edcd5fa75165dee80b94a3e5c91049df5f7c057ccf097551c", + "sha256:db0d213987bcd4e6d41710fb4532b22315b0d8fb439ff901782234456556aed1", + "sha256:dbbd5cf7690a40a9f0a9325ab480d0fccf46d16b378eefc08e195d84299bfae1", + "sha256:e16e07a0ec3a75b5ee61f2b1003c35696738f937dc8148fbda9fe2147ccb6e61", + "sha256:e175a006725c7faadbe69e791877d09936c0ef2cf49d01b60a6c1efcb0e8be6f", + "sha256:edd9c13a97f6550f9da2236126bb51c092b3b1ce6187f2bd966533ad794bbb5e", + "sha256:fa39ea60d527fbdd94215b5e5552f1c6a912624521093f1384a491a8ad89ad8b" + ], + "version": "==4.2.5" + }, "markupsafe": { "hashes": [ - "sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665" + "sha256:048ef924c1623740e70204aa7143ec592504045ae4429b59c30054cb31e3c432", + "sha256:130f844e7f5bdd8e9f3f42e7102ef1d49b2e6fdf0d7526df3f87281a532d8c8b", + "sha256:19f637c2ac5ae9da8bfd98cef74d64b7e1bb8a63038a3505cd182c3fac5eb4d9", + 
"sha256:1b8a7a87ad1b92bd887568ce54b23565f3fd7018c4180136e1cf412b405a47af", + "sha256:1c25694ca680b6919de53a4bb3bdd0602beafc63ff001fea2f2fc16ec3a11834", + "sha256:1f19ef5d3908110e1e891deefb5586aae1b49a7440db952454b4e281b41620cd", + "sha256:1fa6058938190ebe8290e5cae6c351e14e7bb44505c4a7624555ce57fbbeba0d", + "sha256:31cbb1359e8c25f9f48e156e59e2eaad51cd5242c05ed18a8de6dbe85184e4b7", + "sha256:3e835d8841ae7863f64e40e19477f7eb398674da6a47f09871673742531e6f4b", + "sha256:4e97332c9ce444b0c2c38dd22ddc61c743eb208d916e4265a2a3b575bdccb1d3", + "sha256:525396ee324ee2da82919f2ee9c9e73b012f23e7640131dd1b53a90206a0f09c", + "sha256:52b07fbc32032c21ad4ab060fec137b76eb804c4b9a1c7c7dc562549306afad2", + "sha256:52ccb45e77a1085ec5461cde794e1aa037df79f473cbc69b974e73940655c8d7", + "sha256:5c3fbebd7de20ce93103cb3183b47671f2885307df4a17a0ad56a1dd51273d36", + "sha256:5e5851969aea17660e55f6a3be00037a25b96a9b44d2083651812c99d53b14d1", + "sha256:5edfa27b2d3eefa2210fb2f5d539fbed81722b49f083b2c6566455eb7422fd7e", + "sha256:7d263e5770efddf465a9e31b78362d84d015cc894ca2c131901a4445eaa61ee1", + "sha256:83381342bfc22b3c8c06f2dd93a505413888694302de25add756254beee8449c", + "sha256:857eebb2c1dc60e4219ec8e98dfa19553dae33608237e107db9c6078b1167856", + "sha256:98e439297f78fca3a6169fd330fbe88d78b3bb72f967ad9961bcac0d7fdd1550", + "sha256:bf54103892a83c64db58125b3f2a43df6d2cb2d28889f14c78519394feb41492", + "sha256:d9ac82be533394d341b41d78aca7ed0e0f4ba5a2231602e2f05aa87f25c51672", + "sha256:e982fe07ede9fada6ff6705af70514a52beb1b2c3d25d4e873e82114cf3c5401", + "sha256:edce2ea7f3dfc981c4ddc97add8a61381d9642dc3273737e756517cc03e84dd6", + "sha256:efdc45ef1afc238db84cb4963aa689c0408912a0239b0721cb172b4016eb31d6", + "sha256:f137c02498f8b935892d5c0172560d7ab54bc45039de8805075e19079c639a9c", + "sha256:f82e347a72f955b7017a39708a3667f106e6ad4d10b25f237396a7115d8ed5fd", + "sha256:fb7c206e01ad85ce57feeaaa0bf784b97fa3cad0d4a5737bc5295785f5c613a1" ], - "version": "==1.0" + "version": "==1.1.0" }, "marshmallow": { "hashes": 
[ @@ -125,47 +187,47 @@ }, "marshmallow-sqlalchemy": { "hashes": [ - "sha256:a42cdbd6b623059fca601e1b572cab28f00d4acf36e2cef38094c88424b3dcf1", - "sha256:aacb0a7e0f6b5d489cdb3c10d1ab420f74c21538838026337738e4c6e8848fd8" + "sha256:1a4813bbcd2a34f10b1fcad5f4ed85355739f39edb223e6cf68a95bd75807885", + "sha256:5fc53b6fac10c3e0d0c3e1ba19312860b54534ffc56bc5d9615bf680f35a18de" ], "index": "pypi", - "version": "==0.14.1" + "version": "==0.15.0" }, "psycopg2": { "hashes": [ - "sha256:0b9e48a1c1505699a64ac58815ca99104aacace8321e455072cee4f7fe7b2698", - "sha256:0f4c784e1b5a320efb434c66a50b8dd7e30a7dc047e8f45c0a8d2694bfe72781", - "sha256:0fdbaa32c9eb09ef09d425dc154628fca6fa69d2f7c1a33f889abb7e0efb3909", - "sha256:11fbf688d5c953c0a5ba625cc42dea9aeb2321942c7c5ed9341a68f865dc8cb1", - "sha256:19eaac4eb25ab078bd0f28304a0cb08702d120caadfe76bb1e6846ed1f68635e", - "sha256:3232ec1a3bf4dba97fbf9b03ce12e4b6c1d01ea3c85773903a67ced725728232", - "sha256:36f8f9c216fcca048006f6dd60e4d3e6f406afde26cfb99e063f137070139eaf", - "sha256:59c1a0e4f9abe970062ed35d0720935197800a7ef7a62b3a9e3a70588d9ca40b", - "sha256:6506c5ff88750948c28d41852c09c5d2a49f51f28c6d90cbf1b6808e18c64e88", - "sha256:6bc3e68ee16f571681b8c0b6d5c0a77bef3c589012352b3f0cf5520e674e9d01", - "sha256:6dbbd7aabbc861eec6b910522534894d9dbb507d5819bc982032c3ea2e974f51", - "sha256:6e737915de826650d1a5f7ff4ac6cf888a26f021a647390ca7bafdba0e85462b", - "sha256:6ed9b2cfe85abc720e8943c1808eeffd41daa73e18b7c1e1a228b0b91f768ccc", - "sha256:711ec617ba453fdfc66616db2520db3a6d9a891e3bf62ef9aba4c95bb4e61230", - "sha256:844dacdf7530c5c612718cf12bc001f59b2d9329d35b495f1ff25045161aa6af", - "sha256:86b52e146da13c896e50c5a3341a9448151f1092b1a4153e425d1e8b62fec508", - "sha256:985c06c2a0f227131733ae58d6a541a5bc8b665e7305494782bebdb74202b793", - "sha256:a86dfe45f4f9c55b1a2312ff20a59b30da8d39c0e8821d00018372a2a177098f", - "sha256:aa3cd07f7f7e3183b63d48300666f920828a9dbd7d7ec53d450df2c4953687a9", - 
"sha256:b1964ed645ef8317806d615d9ff006c0dadc09dfc54b99ae67f9ba7a1ec9d5d2", - "sha256:b2abbff9e4141484bb89b96eb8eae186d77bc6d5ffbec6b01783ee5c3c467351", - "sha256:cc33c3a90492e21713260095f02b12bee02b8d1f2c03a221d763ce04fa90e2e9", - "sha256:d7de3bf0986d777807611c36e809b77a13bf1888f5c8db0ebf24b47a52d10726", - "sha256:db5e3c52576cc5b93a959a03ccc3b02cb8f0af1fbbdc80645f7a215f0b864f3a", - "sha256:e168aa795ffbb11379c942cf95bf813c7db9aa55538eb61de8c6815e092416f5", - "sha256:e9ca911f8e2d3117e5241d5fa9aaa991cb22fb0792627eeada47425d706b5ec8", - "sha256:eccf962d41ca46e6326b97c8fe0a6687b58dfc1a5f6540ed071ff1474cea749e", - "sha256:efa19deae6b9e504a74347fe5e25c2cb9343766c489c2ae921b05f37338b18d1", - "sha256:f4b0460a21f784abe17b496f66e74157a6c36116fa86da8bf6aa028b9e8ad5fe", - "sha256:f93d508ca64d924d478fb11e272e09524698f0c581d9032e68958cfbdd41faef" + "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e", + "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237", + "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f", + "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300", + "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076", + "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221", + "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82", + "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92", + "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3", + "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6", + "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e", + "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae", + "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535", + "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f", + "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26", 
+ "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7", + "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53", + "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b", + "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc", + "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee", + "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20", + "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d", + "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9", + "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244", + "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f", + "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b", + "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370", + "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9", + "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d", + "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0" ], "index": "pypi", - "version": "==2.7.5" + "version": "==2.7.6.1" }, "pykafka": { "hashes": [ @@ -193,11 +255,19 @@ }, "requests": { "hashes": [ - "sha256:99dcfdaaeb17caf6e526f32b6a7b780461512ab3f1d992187801694cba42770c", - "sha256:a84b8c9ab6239b578f22d1c21d51b696dcfe004032bb80ea832398d6909d7279" + "sha256:65b3a120e4329e33c9889db89c80976c5272f56ea92d3e74da8a463992e3ff54", + "sha256:ea881206e59f41dbd0bd445437d792e43906703fff75ca8ff43ccdb11f33f263" ], "index": "pypi", - "version": "==2.20.0" + "version": "==2.20.1" + }, + "sickle": { + "hashes": [ + "sha256:76d66ed4607af2cd36ee15568a98e7f147d4ec3dd227bd047664a1ca88b21944", + "sha256:b0aaa41d97a0c355aa6099b4dfa46c03f0bf828e6171960a15d68bd0548215ec" + ], + "index": "pypi", + "version": "==0.6.4" }, "six": { "hashes": [ @@ -208,10 +278,10 @@ }, "sqlalchemy": { "hashes": [ - 
"sha256:84412de3794acee05630e7788f25e80e81f78eb4837e7b71d0499129f660486a" + "sha256:9de7c7dabcf06319becdb7e15099c44e5e34ba7062f9ba10bc00e562f5db3d04" ], "index": "pypi", - "version": "==1.2.13" + "version": "==1.2.14" }, "tabulate": { "hashes": [ @@ -279,49 +349,49 @@ }, "coverage": { "hashes": [ - "sha256:03481e81d558d30d230bc12999e3edffe392d244349a90f4ef9b88425fac74ba", - "sha256:0b136648de27201056c1869a6c0d4e23f464750fd9a9ba9750b8336a244429ed", - "sha256:0bf8cbbd71adfff0ef1f3a1531e6402d13b7b01ac50a79c97ca15f030dba6306", - "sha256:104ab3934abaf5be871a583541e8829d6c19ce7bde2923b2751e0d3ca44db60a", - "sha256:10a46017fef60e16694a30627319f38a2b9b52e90182dddb6e37dcdab0f4bf95", - "sha256:15b111b6a0f46ee1a485414a52a7ad1d703bdf984e9ed3c288a4414d3871dcbd", - "sha256:198626739a79b09fa0a2f06e083ffd12eb55449b5f8bfdbeed1df4910b2ca640", - "sha256:1c383d2ef13ade2acc636556fd544dba6e14fa30755f26812f54300e401f98f2", - "sha256:23d341cdd4a0371820eb2b0bd6b88f5003a7438bbedb33688cd33b8eae59affd", - "sha256:28b2191e7283f4f3568962e373b47ef7f0392993bb6660d079c62bd50fe9d162", - "sha256:2a5b73210bad5279ddb558d9a2bfedc7f4bf6ad7f3c988641d83c40293deaec1", - "sha256:2eb564bbf7816a9d68dd3369a510be3327f1c618d2357fa6b1216994c2e3d508", - "sha256:337ded681dd2ef9ca04ef5d93cfc87e52e09db2594c296b4a0a3662cb1b41249", - "sha256:3a2184c6d797a125dca8367878d3b9a178b6fdd05fdc2d35d758c3006a1cd694", - "sha256:3c79a6f7b95751cdebcd9037e4d06f8d5a9b60e4ed0cd231342aa8ad7124882a", - "sha256:3d72c20bd105022d29b14a7d628462ebdc61de2f303322c0212a054352f3b287", - "sha256:3eb42bf89a6be7deb64116dd1cc4b08171734d721e7a7e57ad64cc4ef29ed2f1", - "sha256:4635a184d0bbe537aa185a34193898eee409332a8ccb27eea36f262566585000", - "sha256:56e448f051a201c5ebbaa86a5efd0ca90d327204d8b059ab25ad0f35fbfd79f1", - "sha256:5a13ea7911ff5e1796b6d5e4fbbf6952381a611209b736d48e675c2756f3f74e", - "sha256:69bf008a06b76619d3c3f3b1983f5145c75a305a0fea513aca094cae5c40a8f5", - "sha256:6bc583dc18d5979dc0f6cec26a8603129de0304d5ae1f17e57a12834e7235062", - 
"sha256:701cd6093d63e6b8ad7009d8a92425428bc4d6e7ab8d75efbb665c806c1d79ba", - "sha256:7608a3dd5d73cb06c531b8925e0ef8d3de31fed2544a7de6c63960a1e73ea4bc", - "sha256:76ecd006d1d8f739430ec50cc872889af1f9c1b6b8f48e29941814b09b0fd3cc", - "sha256:7aa36d2b844a3e4a4b356708d79fd2c260281a7390d678a10b91ca595ddc9e99", - "sha256:7d3f553904b0c5c016d1dad058a7554c7ac4c91a789fca496e7d8347ad040653", - "sha256:7e1fe19bd6dce69d9fd159d8e4a80a8f52101380d5d3a4d374b6d3eae0e5de9c", - "sha256:8c3cb8c35ec4d9506979b4cf90ee9918bc2e49f84189d9bf5c36c0c1119c6558", - "sha256:9d6dd10d49e01571bf6e147d3b505141ffc093a06756c60b053a859cb2128b1f", - "sha256:9e112fcbe0148a6fa4f0a02e8d58e94470fc6cb82a5481618fea901699bf34c4", - "sha256:ac4fef68da01116a5c117eba4dd46f2e06847a497de5ed1d64bb99a5fda1ef91", - "sha256:b8815995e050764c8610dbc82641807d196927c3dbed207f0a079833ffcf588d", - "sha256:be6cfcd8053d13f5f5eeb284aa8a814220c3da1b0078fa859011c7fffd86dab9", - "sha256:c1bb572fab8208c400adaf06a8133ac0712179a334c09224fb11393e920abcdd", - "sha256:de4418dadaa1c01d497e539210cb6baa015965526ff5afc078c57ca69160108d", - "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6", - "sha256:e4d96c07229f58cb686120f168276e434660e4358cc9cf3b0464210b04913e77", - "sha256:f05a636b4564104120111800021a92e43397bc12a5c72fed7036be8556e0029e", - "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80", - "sha256:f8a923a85cb099422ad5a2e345fe877bbc89a8a8b23235824a93488150e45f6e" - ], - "version": "==4.5.1" + "sha256:06123b58a1410873e22134ca2d88bd36680479fe354955b3579fb8ff150e4d27", + "sha256:09e47c529ff77bf042ecfe858fb55c3e3eb97aac2c87f0349ab5a7efd6b3939f", + "sha256:0a1f9b0eb3aa15c990c328535655847b3420231af299386cfe5efc98f9c250fe", + "sha256:0cc941b37b8c2ececfed341444a456912e740ecf515d560de58b9a76562d966d", + "sha256:0d34245f824cc3140150ab7848d08b7e2ba67ada959d77619c986f2062e1f0e8", + "sha256:10e8af18d1315de936d67775d3a814cc81d0747a1a0312d84e27ae5610e313b0", + 
"sha256:1b4276550b86caa60606bd3572b52769860a81a70754a54acc8ba789ce74d607", + "sha256:1e8a2627c48266c7b813975335cfdea58c706fe36f607c97d9392e61502dc79d", + "sha256:258b21c5cafb0c3768861a6df3ab0cfb4d8b495eee5ec660e16f928bf7385390", + "sha256:2b224052bfd801beb7478b03e8a66f3f25ea56ea488922e98903914ac9ac930b", + "sha256:3ad59c84c502cd134b0088ca9038d100e8fb5081bbd5ccca4863f3804d81f61d", + "sha256:447c450a093766744ab53bf1e7063ec82866f27bcb4f4c907da25ad293bba7e3", + "sha256:46101fc20c6f6568561cdd15a54018bb42980954b79aa46da8ae6f008066a30e", + "sha256:4710dc676bb4b779c4361b54eb308bc84d64a2fa3d78e5f7228921eccce5d815", + "sha256:510986f9a280cd05189b42eee2b69fecdf5bf9651d4cd315ea21d24a964a3c36", + "sha256:5535dda5739257effef56e49a1c51c71f1d37a6e5607bb25a5eee507c59580d1", + "sha256:5a7524042014642b39b1fcae85fb37556c200e64ec90824ae9ecf7b667ccfc14", + "sha256:5f55028169ef85e1fa8e4b8b1b91c0b3b0fa3297c4fb22990d46ff01d22c2d6c", + "sha256:6694d5573e7790a0e8d3d177d7a416ca5f5c150742ee703f3c18df76260de794", + "sha256:6831e1ac20ac52634da606b658b0b2712d26984999c9d93f0c6e59fe62ca741b", + "sha256:71afc1f5cd72ab97330126b566bbf4e8661aab7449f08895d21a5d08c6b051ff", + "sha256:7349c27128334f787ae63ab49d90bf6d47c7288c63a0a5dfaa319d4b4541dd2c", + "sha256:77f0d9fa5e10d03aa4528436e33423bfa3718b86c646615f04616294c935f840", + "sha256:828ad813c7cdc2e71dcf141912c685bfe4b548c0e6d9540db6418b807c345ddd", + "sha256:859714036274a75e6e57c7bab0c47a4602d2a8cfaaa33bbdb68c8359b2ed4f5c", + "sha256:85a06c61598b14b015d4df233d249cd5abfa61084ef5b9f64a48e997fd829a82", + "sha256:869ef4a19f6e4c6987e18b315721b8b971f7048e6eaea29c066854242b4e98d9", + "sha256:8cb4febad0f0b26c6f62e1628f2053954ad2c555d67660f28dfb1b0496711952", + "sha256:977e2d9a646773cc7428cdd9a34b069d6ee254fadfb4d09b3f430e95472f3cf3", + "sha256:99bd767c49c775b79fdcd2eabff405f1063d9d959039c0bdd720527a7738748a", + "sha256:a5c58664b23b248b16b96253880b2868fb34358911400a7ba39d7f6399935389", + "sha256:aaa0f296e503cda4bc07566f592cd7a28779d433f3a23c48082af425d6d5a78f", 
+ "sha256:ab235d9fe64833f12d1334d29b558aacedfbca2356dfb9691f2d0d38a8a7bfb4", + "sha256:b3b0c8f660fae65eac74fbf003f3103769b90012ae7a460863010539bb7a80da", + "sha256:bab8e6d510d2ea0f1d14f12642e3f35cefa47a9b2e4c7cea1852b52bc9c49647", + "sha256:c45297bbdbc8bb79b02cf41417d63352b70bcb76f1bbb1ee7d47b3e89e42f95d", + "sha256:d19bca47c8a01b92640c614a9147b081a1974f69168ecd494687c827109e8f42", + "sha256:d64b4340a0c488a9e79b66ec9f9d77d02b99b772c8b8afd46c1294c1d39ca478", + "sha256:da969da069a82bbb5300b59161d8d7c8d423bc4ccd3b410a9b4d8932aeefc14b", + "sha256:ed02c7539705696ecb7dc9d476d861f3904a8d2b7e894bd418994920935d36bb", + "sha256:ee5b8abc35b549012e03a7b1e86c09491457dba6c94112a2482b18589cc2bdb9" + ], + "version": "==4.5.2" }, "decorator": { "hashes": [ @@ -470,39 +540,39 @@ }, "psycopg2": { "hashes": [ - "sha256:0b9e48a1c1505699a64ac58815ca99104aacace8321e455072cee4f7fe7b2698", - "sha256:0f4c784e1b5a320efb434c66a50b8dd7e30a7dc047e8f45c0a8d2694bfe72781", - "sha256:0fdbaa32c9eb09ef09d425dc154628fca6fa69d2f7c1a33f889abb7e0efb3909", - "sha256:11fbf688d5c953c0a5ba625cc42dea9aeb2321942c7c5ed9341a68f865dc8cb1", - "sha256:19eaac4eb25ab078bd0f28304a0cb08702d120caadfe76bb1e6846ed1f68635e", - "sha256:3232ec1a3bf4dba97fbf9b03ce12e4b6c1d01ea3c85773903a67ced725728232", - "sha256:36f8f9c216fcca048006f6dd60e4d3e6f406afde26cfb99e063f137070139eaf", - "sha256:59c1a0e4f9abe970062ed35d0720935197800a7ef7a62b3a9e3a70588d9ca40b", - "sha256:6506c5ff88750948c28d41852c09c5d2a49f51f28c6d90cbf1b6808e18c64e88", - "sha256:6bc3e68ee16f571681b8c0b6d5c0a77bef3c589012352b3f0cf5520e674e9d01", - "sha256:6dbbd7aabbc861eec6b910522534894d9dbb507d5819bc982032c3ea2e974f51", - "sha256:6e737915de826650d1a5f7ff4ac6cf888a26f021a647390ca7bafdba0e85462b", - "sha256:6ed9b2cfe85abc720e8943c1808eeffd41daa73e18b7c1e1a228b0b91f768ccc", - "sha256:711ec617ba453fdfc66616db2520db3a6d9a891e3bf62ef9aba4c95bb4e61230", - "sha256:844dacdf7530c5c612718cf12bc001f59b2d9329d35b495f1ff25045161aa6af", - 
"sha256:86b52e146da13c896e50c5a3341a9448151f1092b1a4153e425d1e8b62fec508", - "sha256:985c06c2a0f227131733ae58d6a541a5bc8b665e7305494782bebdb74202b793", - "sha256:a86dfe45f4f9c55b1a2312ff20a59b30da8d39c0e8821d00018372a2a177098f", - "sha256:aa3cd07f7f7e3183b63d48300666f920828a9dbd7d7ec53d450df2c4953687a9", - "sha256:b1964ed645ef8317806d615d9ff006c0dadc09dfc54b99ae67f9ba7a1ec9d5d2", - "sha256:b2abbff9e4141484bb89b96eb8eae186d77bc6d5ffbec6b01783ee5c3c467351", - "sha256:cc33c3a90492e21713260095f02b12bee02b8d1f2c03a221d763ce04fa90e2e9", - "sha256:d7de3bf0986d777807611c36e809b77a13bf1888f5c8db0ebf24b47a52d10726", - "sha256:db5e3c52576cc5b93a959a03ccc3b02cb8f0af1fbbdc80645f7a215f0b864f3a", - "sha256:e168aa795ffbb11379c942cf95bf813c7db9aa55538eb61de8c6815e092416f5", - "sha256:e9ca911f8e2d3117e5241d5fa9aaa991cb22fb0792627eeada47425d706b5ec8", - "sha256:eccf962d41ca46e6326b97c8fe0a6687b58dfc1a5f6540ed071ff1474cea749e", - "sha256:efa19deae6b9e504a74347fe5e25c2cb9343766c489c2ae921b05f37338b18d1", - "sha256:f4b0460a21f784abe17b496f66e74157a6c36116fa86da8bf6aa028b9e8ad5fe", - "sha256:f93d508ca64d924d478fb11e272e09524698f0c581d9032e68958cfbdd41faef" + "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e", + "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237", + "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f", + "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300", + "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076", + "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221", + "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82", + "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92", + "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3", + "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6", + "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e", 
+ "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae", + "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535", + "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f", + "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26", + "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7", + "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53", + "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b", + "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc", + "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee", + "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20", + "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d", + "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9", + "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244", + "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f", + "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b", + "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370", + "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9", + "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d", + "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0" ], "index": "pypi", - "version": "==2.7.5" + "version": "==2.7.6.1" }, "ptyprocess": { "hashes": [ @@ -535,11 +605,11 @@ }, "pytest": { "hashes": [ - "sha256:a9e5e8d7ab9d5b0747f37740276eb362e6a76275d76cebbb52c6049d93b475db", - "sha256:bf47e8ed20d03764f963f0070ff1c8fda6e2671fc5dd562a4d3b7148ad60f5ca" + "sha256:488c842647bbeb350029da10325cb40af0a9c7a2fdda45aeb1dda75b60048ffb", + "sha256:c055690dfefa744992f563e8c3a654089a6aa5b8092dded9b6fafbd70b2e45a7" ], "index": "pypi", - "version": "==3.9.3" + "version": "==4.0.0" }, "pytest-cov": 
{ "hashes": [ @@ -558,19 +628,19 @@ }, "requests": { "hashes": [ - "sha256:99dcfdaaeb17caf6e526f32b6a7b780461512ab3f1d992187801694cba42770c", - "sha256:a84b8c9ab6239b578f22d1c21d51b696dcfe004032bb80ea832398d6909d7279" + "sha256:65b3a120e4329e33c9889db89c80976c5272f56ea92d3e74da8a463992e3ff54", + "sha256:ea881206e59f41dbd0bd445437d792e43906703fff75ca8ff43ccdb11f33f263" ], "index": "pypi", - "version": "==2.20.0" + "version": "==2.20.1" }, "responses": { "hashes": [ - "sha256:682fafb124e799eeee67ec15c9678d955a88affda5613b09788ef80c03987cf0", - "sha256:9b1c14871c66329f509711627e3de5779a2ae50bd532ac162297623424288756" + "sha256:16ad4a7a914f20792111157adf09c63a8dc37699c57d1ad20dbc281a4f5743fb", + "sha256:b9b31d9b1fcf6d48aea044c9fdd3d04199f6d227b0650c15d2566b0135bc1ed7" ], "index": "pypi", - "version": "==0.10.2" + "version": "==0.10.4" }, "six": { "hashes": [ diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py index f1bb3416..6ecc3ec6 100755 --- a/python/fatcat_harvest.py +++ b/python/fatcat_harvest.py @@ -3,11 +3,13 @@ import sys import argparse import datetime -from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker +from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker,\ + HarvestArxivWorker, HarvestPubmedWorker, HarvestDoajArticleWorker,\ + HarvestDoajJournalWorker def run_crossref(args): worker = HarvestCrossrefWorker( - args.kafka_hosts, + kafka_hosts=args.kafka_hosts, produce_topic="fatcat-{}.crossref".format(args.env), state_topic="fatcat-{}.crossref-state".format(args.env), contact_email=args.contact_email, @@ -17,7 +19,7 @@ def run_crossref(args): def run_datacite(args): worker = HarvestDataciteWorker( - args.kafka_hosts, + kafka_hosts=args.kafka_hosts, produce_topic="fatcat-{}.datacite".format(args.env), state_topic="fatcat-{}.datacite-state".format(args.env), contact_email=args.contact_email, @@ -25,6 +27,43 @@ def run_datacite(args): end_date=args.end_date) worker.run() +def run_arxiv(args): + 
worker = HarvestArxivWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.arxiv".format(args.env), + state_topic="fatcat-{}.arxiv-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + +def run_pubmed(args): + worker = HarvestPubmedWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.pubmed".format(args.env), + state_topic="fatcat-{}.pubmed-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + +def run_doaj_article(args): + worker = HarvestDoajArticleWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.doaj-article".format(args.env), + state_topic="fatcat-{}.doaj-article-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + +def run_doaj_journal(args): + worker = HarvestDoajJournalWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.doaj-journal".format(args.env), + state_topic="fatcat-{}.doaj-journal-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + + def mkdate(raw): return datetime.datetime.strptime(raw, "%Y-%m-%d").date() @@ -59,6 +98,18 @@ def main(): sub_datacite = subparsers.add_parser('datacite') sub_datacite.set_defaults(func=run_datacite) + sub_arxiv = subparsers.add_parser('arxiv') + sub_arxiv.set_defaults(func=run_arxiv) + + sub_pubmed = subparsers.add_parser('pubmed') + sub_pubmed.set_defaults(func=run_pubmed) + + # DOAJ stuff disabled because API range-requests are broken + #sub_doaj_article = subparsers.add_parser('doaj-article') + #sub_doaj_article.set_defaults(func=run_doaj_article) + #sub_doaj_journal = subparsers.add_parser('doaj-journal') + #sub_doaj_journal.set_defaults(func=run_doaj_journal) + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") diff --git a/python/fatcat_tools/harvest/__init__.py b/python/fatcat_tools/harvest/__init__.py index 4de2cbde..7d814696 100644 --- 
a/python/fatcat_tools/harvest/__init__.py +++ b/python/fatcat_tools/harvest/__init__.py @@ -1,3 +1,5 @@ from .harvest_common import HarvestState from .doi_registrars import HarvestCrossrefWorker, HarvestDataciteWorker +from .oaipmh import HarvestArxivWorker, HarvestPubmedWorker,\ + HarvestDoajArticleWorker, HarvestDoajJournalWorker diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index d5e4b7ec..10492c17 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -10,15 +10,13 @@ import datetime from pykafka import KafkaClient from fatcat_tools.workers import most_recent_message -from .harvest_common import HarvestState +from .harvest_common import HarvestState, DATE_FMT # Skip pylint due to: # AttributeError: 'NoneType' object has no attribute 'scope' # in 'astroid/node_classes.py' # pylint: skip-file -DATE_FMT = "%Y-%m-%d" - class HarvestCrossrefWorker: """ @@ -68,7 +66,6 @@ class HarvestCrossrefWorker: self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks self.api_batch_size = 50 - # for crossref, it's "from-index-date" self.name = "Crossref" def params(self, date_str): @@ -86,6 +83,9 @@ class HarvestCrossrefWorker: params['cursor'] = resp['message']['next-cursor'] return params + def extract_key(self, obj): + return obj['DOI'].encode('utf-8') + def fetch_date(self, date): produce_topic = self.kafka.topics[self.produce_topic] @@ -112,7 +112,7 @@ class HarvestCrossrefWorker: self.extract_total(resp), http_resp.elapsed)) #print(json.dumps(resp)) for work in items: - producer.produce(json.dumps(work).encode('utf-8')) + producer.produce(json.dumps(work).encode('utf-8'), partition_key=self.extract_key(work)) if len(items) < self.api_batch_size: break params = self.update_params(params, resp) @@ -181,6 +181,9 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): def extract_total(self, resp): return resp['meta']['total'] + def 
extract_key(self, obj): + return obj['doi'].encode('utf-8') + def update_params(self, params, resp): params['page[number]'] = resp['meta']['page'] + 1 return params diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py new file mode 100644 index 00000000..c3cb90db --- /dev/null +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -0,0 +1,157 @@ + +""" +OAI-PMH protocol: + https://sickle.readthedocs.io/en/latest/ + +Pubmed + https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ + https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm + https://github.com/titipata/pubmed_parser + +arxiv + some APIs work on a per-version basis, others do not + + http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv + http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw + +doaj + https://github.com/miku/doajfetch + +----- + +actually, just going to re-use https://github.com/miku/metha for OAI-PMH stuff + => shell script from cronjob + => call metha-sync daily + => metha-cat -since | kafkacat output + => echo "date" | kafkat state + => some shell trick (comm?) to find missing dates; for each, do metha-cat into kafka + +or, just skip kafka for this stuff for now? hrm. + +crossref-like stuff is far enough along to keep + +## More Miku Magic! + +wowa, JSTOR KBART files! 
+    http://www.jstor.org/kbart/collections/all-archive-titles
+
+https://github.com/miku/ldjtab: faster than jq for just grabbing
+
+sort can be told how much memory to use; eg: `sort -S50%`, and threads to use
+
+"""
+
+import re
+import sys
+import csv
+import json
+import time
+import requests
+import itertools
+import datetime
+from pykafka import KafkaClient
+import sickle
+
+from fatcat_tools.workers import most_recent_message
+from .harvest_common import HarvestState, DATE_FMT
+
+
+class HarvestOaiPmhWorker:
+    """
+    Base class for OAI-PMH harvesters.
+
+    Based on Crossref importer
+
+    Subclasses call this constructor and then set `endpoint_url`,
+    `metadata_prefix` and `name`. Records are listed one date at a time with
+    sickle; the raw text of each record is produced to the Kafka
+    `produce_topic`, keyed by the identifier from the record header. Dates
+    that have been harvested are tracked through HarvestState and the Kafka
+    `state_topic`.
+    """
+
+
+    def __init__(self, kafka_hosts, produce_topic, state_topic,
+            start_date=None, end_date=None):
+        """
+        kafka_hosts: passed to pykafka's KafkaClient as `hosts`
+        produce_topic: name of the Kafka topic that receives raw records
+        state_topic: name of the Kafka topic used to persist harvest state
+        start_date, end_date: passed through to HarvestState
+        """
+
+        self.produce_topic = produce_topic
+        self.state_topic = state_topic
+        self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")
+
+        self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks
+
+        self.endpoint_url = None # needs override
+        self.metadata_prefix = None # needs override
+        # run() prints this; give it a default so a subclass that forgets to
+        # set it does not die with AttributeError after harvesting
+        self.name = "unnamed"
+        self.state = HarvestState(start_date, end_date)
+        self.state.initialize_from_kafka(self.kafka.topics[self.state_topic])
+
+
+    def fetch_date(self, date):
+        """
+        Lists all records from `endpoint_url` with 'from' and 'until' both
+        set to `date`, and produces each one to the Kafka produce topic.
+
+        date: object with a strftime() method (run() passes whatever
+            HarvestState.next() returned)
+
+        Returns None. A date without any records is logged and skipped.
+        """
+
+        api = sickle.Sickle(self.endpoint_url)
+        date_str = date.strftime(DATE_FMT)
+        produce_topic = self.kafka.topics[self.produce_topic]
+        # this dict kwargs hack is to work around 'from' as a reserved python keyword
+        # recommended by sickle docs
+        try:
+            records = api.ListRecords(**{
+                'metadataPrefix': self.metadata_prefix,
+                'from': date_str,
+                'until': date_str,
+            })
+        except sickle.oaiexceptions.NoRecordsMatch:
+            # OAI-PMH reports an empty result set as a "noRecordsMatch" error,
+            # which sickle raises as an exception; an empty day is not a
+            # failure, so log it and let run() mark the date as complete
+            print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str))
+            return
+
+        count = 0
+        with produce_topic.get_producer() as producer:
+            for item in records:
+                count += 1
+                if count % 50 == 0:
+                    print("... up to {}".format(count))
+                producer.produce(item.raw.encode('utf-8'), partition_key=item.header.identifier.encode('utf-8'))
+
+    def run(self, continuous=False):
+        """
+        Fetches every date that HarvestState hands out, marking each one
+        complete (in memory and on the Kafka state topic) after it is fetched.
+
+        continuous: passed to HarvestState.next(). If true, sleep
+            `loop_sleep` seconds whenever no date is pending and then check
+            again, forever; if false, return once no date is pending.
+        """
+
+        while True:
+            current = self.state.next(continuous)
+            if current:
+                print("Fetching DOIs updated on {} (UTC)".format(current))
+                self.fetch_date(current)
+                self.state.complete(current, kafka_topic=self.kafka.topics[self.state_topic])
+                continue
+
+            if continuous:
+                print("Sleeping {} seconds...".format(self.loop_sleep))
+                # loop_sleep is a number of seconds, not a callable
+                time.sleep(self.loop_sleep)
+            else:
+                break
+        print("{} DOI ingest caught up".format(self.name))
+
+
+class HarvestArxivWorker(HarvestOaiPmhWorker):
+    """
+    Harvester for the arXiv OAI-PMH endpoint, using the "arXiv" metadata format.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://export.arxiv.org/oai2"
+        self.metadata_prefix = "arXiv"
+        self.name = "arxiv"
+
+
+class HarvestPubmedWorker(HarvestOaiPmhWorker):
+    """
+    Harvester for the PubMed Central OAI-PMH endpoint, using the "pmc_fm"
+    metadata format.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
+        self.metadata_prefix = "pmc_fm"
+        self.name = "pubmed"
+
+
+class HarvestDoajJournalWorker(HarvestOaiPmhWorker):
+    """
+    WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://www.doaj.org/oai"
+        self.metadata_prefix = "oai_dc"
+        self.name = "doaj-journal"
+
+
+class HarvestDoajArticleWorker(HarvestOaiPmhWorker):
+    """
+    WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://www.doaj.org/oai.article"
+        self.metadata_prefix = "oai_doaj"
+        self.name = "doaj-article"
+
-- 
cgit v1.2.3