summaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-11-19 23:04:18 -0800
committerBryan Newbold <bnewbold@robocracy.org>2018-11-19 23:04:18 -0800
commite590eec544ab6f2e54e8770f01e64eef3158fdaa (patch)
tree5f1fe36a489e7e42642d96a3a719dcbd74d60901 /python
parent65bdebea35f2ab3c9c8b0f8a8b0a9a577a36bee2 (diff)
downloadfatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.tar.gz
fatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.zip
initial OAI-PMH harvesters
Diffstat (limited to 'python')
-rw-r--r--python/Pipfile1
-rw-r--r--python/Pipfile.lock320
-rwxr-xr-xpython/fatcat_harvest.py57
-rw-r--r--python/fatcat_tools/harvest/__init__.py2
-rw-r--r--python/fatcat_tools/harvest/doi_registrars.py13
-rw-r--r--python/fatcat_tools/harvest/oaipmh.py157
6 files changed, 417 insertions, 133 deletions
diff --git a/python/Pipfile b/python/Pipfile
index f4137dca..04b5c5ef 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -31,6 +31,7 @@ flask-uuid = "*"
flask-debugtoolbar = "*"
pykafka = "*"
python-dateutil = "*"
+sickle = "*"
[requires]
# Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index da96a24f..fa597faa 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "c99945057fc87c7a825a76cfe4d1abdcb99d0e70dc71db770cf259b761c6835c"
+ "sha256": "f3c3ce52dc614cb4514b4d637732e705815403f9181c142aea59a7feaa51f355"
},
"pipfile-spec": 6,
"requires": {
@@ -110,11 +110,73 @@
],
"version": "==2.5.0"
},
+ "lxml": {
+ "hashes": [
+ "sha256:02bc220d61f46e9b9d5a53c361ef95e9f5e1d27171cd461dddb17677ae2289a5",
+ "sha256:22f253b542a342755f6cfc047fe4d3a296515cf9b542bc6e261af45a80b8caf6",
+ "sha256:2f31145c7ff665b330919bfa44aacd3a0211a76ca7e7b441039d2a0b0451e415",
+ "sha256:36720698c29e7a9626a0dc802ef8885f8f0239bfd1689628ecd459a061f2807f",
+ "sha256:438a1b0203545521f6616132bfe0f4bca86f8a401364008b30e2b26ec408ce85",
+ "sha256:4815892904c336bbaf73dafd54f45f69f4021c22b5bad7332176bbf4fb830568",
+ "sha256:5be031b0f15ad63910d8e5038b489d95a79929513b3634ad4babf77100602588",
+ "sha256:5c93ae37c3c588e829b037fdfbd64a6e40c901d3f93f7beed6d724c44829a3ad",
+ "sha256:60842230678674cdac4a1cf0f707ef12d75b9a4fc4a565add4f710b5fcf185d5",
+ "sha256:62939a8bb6758d1bf923aa1c13f0bcfa9bf5b2fc0f5fa917a6e25db5fe0cfa4e",
+ "sha256:75830c06a62fe7b8fe3bbb5f269f0b308f19f3949ac81cfd40062f47c1455faf",
+ "sha256:81992565b74332c7c1aff6a913a3e906771aa81c9d0c68c68113cffcae45bc53",
+ "sha256:8c892fb0ee52c594d9a7751c7d7356056a9682674b92cc1c4dc968ff0f30c52f",
+ "sha256:9d862e3cf4fc1f2837dedce9c42269c8c76d027e49820a548ac89fdcee1e361f",
+ "sha256:a623965c086a6e91bb703d4da62dabe59fe88888e82c4117d544e11fd74835d6",
+ "sha256:a7783ab7f6a508b0510490cef9f857b763d796ba7476d9703f89722928d1e113",
+ "sha256:aab09fbe8abfa3b9ce62aaf45aca2d28726b1b9ee44871dbe644050a2fff4940",
+ "sha256:abf181934ac3ef193832fb973fd7f6149b5c531903c2ec0f1220941d73eee601",
+ "sha256:ae07fa0c115733fce1e9da96a3ac3fa24801742ca17e917e0c79d63a01eeb843",
+ "sha256:b9c78242219f674ab645ec571c9a95d70f381319a23911941cd2358a8e0521cf",
+ "sha256:bccb267678b870d9782c3b44d0cefe3ba0e329f9af8c946d32bf3778e7a4f271",
+ "sha256:c4df4d27f4c93b2cef74579f00b1d3a31a929c7d8023f870c4b476f03a274db4",
+ "sha256:caf0e50b546bb60dfa99bb18dfa6748458a83131ecdceaf5c071d74907e7e78a",
+ "sha256:d3266bd3ac59ac4edcd5fa75165dee80b94a3e5c91049df5f7c057ccf097551c",
+ "sha256:db0d213987bcd4e6d41710fb4532b22315b0d8fb439ff901782234456556aed1",
+ "sha256:dbbd5cf7690a40a9f0a9325ab480d0fccf46d16b378eefc08e195d84299bfae1",
+ "sha256:e16e07a0ec3a75b5ee61f2b1003c35696738f937dc8148fbda9fe2147ccb6e61",
+ "sha256:e175a006725c7faadbe69e791877d09936c0ef2cf49d01b60a6c1efcb0e8be6f",
+ "sha256:edd9c13a97f6550f9da2236126bb51c092b3b1ce6187f2bd966533ad794bbb5e",
+ "sha256:fa39ea60d527fbdd94215b5e5552f1c6a912624521093f1384a491a8ad89ad8b"
+ ],
+ "version": "==4.2.5"
+ },
"markupsafe": {
"hashes": [
- "sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665"
+ "sha256:048ef924c1623740e70204aa7143ec592504045ae4429b59c30054cb31e3c432",
+ "sha256:130f844e7f5bdd8e9f3f42e7102ef1d49b2e6fdf0d7526df3f87281a532d8c8b",
+ "sha256:19f637c2ac5ae9da8bfd98cef74d64b7e1bb8a63038a3505cd182c3fac5eb4d9",
+ "sha256:1b8a7a87ad1b92bd887568ce54b23565f3fd7018c4180136e1cf412b405a47af",
+ "sha256:1c25694ca680b6919de53a4bb3bdd0602beafc63ff001fea2f2fc16ec3a11834",
+ "sha256:1f19ef5d3908110e1e891deefb5586aae1b49a7440db952454b4e281b41620cd",
+ "sha256:1fa6058938190ebe8290e5cae6c351e14e7bb44505c4a7624555ce57fbbeba0d",
+ "sha256:31cbb1359e8c25f9f48e156e59e2eaad51cd5242c05ed18a8de6dbe85184e4b7",
+ "sha256:3e835d8841ae7863f64e40e19477f7eb398674da6a47f09871673742531e6f4b",
+ "sha256:4e97332c9ce444b0c2c38dd22ddc61c743eb208d916e4265a2a3b575bdccb1d3",
+ "sha256:525396ee324ee2da82919f2ee9c9e73b012f23e7640131dd1b53a90206a0f09c",
+ "sha256:52b07fbc32032c21ad4ab060fec137b76eb804c4b9a1c7c7dc562549306afad2",
+ "sha256:52ccb45e77a1085ec5461cde794e1aa037df79f473cbc69b974e73940655c8d7",
+ "sha256:5c3fbebd7de20ce93103cb3183b47671f2885307df4a17a0ad56a1dd51273d36",
+ "sha256:5e5851969aea17660e55f6a3be00037a25b96a9b44d2083651812c99d53b14d1",
+ "sha256:5edfa27b2d3eefa2210fb2f5d539fbed81722b49f083b2c6566455eb7422fd7e",
+ "sha256:7d263e5770efddf465a9e31b78362d84d015cc894ca2c131901a4445eaa61ee1",
+ "sha256:83381342bfc22b3c8c06f2dd93a505413888694302de25add756254beee8449c",
+ "sha256:857eebb2c1dc60e4219ec8e98dfa19553dae33608237e107db9c6078b1167856",
+ "sha256:98e439297f78fca3a6169fd330fbe88d78b3bb72f967ad9961bcac0d7fdd1550",
+ "sha256:bf54103892a83c64db58125b3f2a43df6d2cb2d28889f14c78519394feb41492",
+ "sha256:d9ac82be533394d341b41d78aca7ed0e0f4ba5a2231602e2f05aa87f25c51672",
+ "sha256:e982fe07ede9fada6ff6705af70514a52beb1b2c3d25d4e873e82114cf3c5401",
+ "sha256:edce2ea7f3dfc981c4ddc97add8a61381d9642dc3273737e756517cc03e84dd6",
+ "sha256:efdc45ef1afc238db84cb4963aa689c0408912a0239b0721cb172b4016eb31d6",
+ "sha256:f137c02498f8b935892d5c0172560d7ab54bc45039de8805075e19079c639a9c",
+ "sha256:f82e347a72f955b7017a39708a3667f106e6ad4d10b25f237396a7115d8ed5fd",
+ "sha256:fb7c206e01ad85ce57feeaaa0bf784b97fa3cad0d4a5737bc5295785f5c613a1"
],
- "version": "==1.0"
+ "version": "==1.1.0"
},
"marshmallow": {
"hashes": [
@@ -125,47 +187,47 @@
},
"marshmallow-sqlalchemy": {
"hashes": [
- "sha256:a42cdbd6b623059fca601e1b572cab28f00d4acf36e2cef38094c88424b3dcf1",
- "sha256:aacb0a7e0f6b5d489cdb3c10d1ab420f74c21538838026337738e4c6e8848fd8"
+ "sha256:1a4813bbcd2a34f10b1fcad5f4ed85355739f39edb223e6cf68a95bd75807885",
+ "sha256:5fc53b6fac10c3e0d0c3e1ba19312860b54534ffc56bc5d9615bf680f35a18de"
],
"index": "pypi",
- "version": "==0.14.1"
+ "version": "==0.15.0"
},
"psycopg2": {
"hashes": [
- "sha256:0b9e48a1c1505699a64ac58815ca99104aacace8321e455072cee4f7fe7b2698",
- "sha256:0f4c784e1b5a320efb434c66a50b8dd7e30a7dc047e8f45c0a8d2694bfe72781",
- "sha256:0fdbaa32c9eb09ef09d425dc154628fca6fa69d2f7c1a33f889abb7e0efb3909",
- "sha256:11fbf688d5c953c0a5ba625cc42dea9aeb2321942c7c5ed9341a68f865dc8cb1",
- "sha256:19eaac4eb25ab078bd0f28304a0cb08702d120caadfe76bb1e6846ed1f68635e",
- "sha256:3232ec1a3bf4dba97fbf9b03ce12e4b6c1d01ea3c85773903a67ced725728232",
- "sha256:36f8f9c216fcca048006f6dd60e4d3e6f406afde26cfb99e063f137070139eaf",
- "sha256:59c1a0e4f9abe970062ed35d0720935197800a7ef7a62b3a9e3a70588d9ca40b",
- "sha256:6506c5ff88750948c28d41852c09c5d2a49f51f28c6d90cbf1b6808e18c64e88",
- "sha256:6bc3e68ee16f571681b8c0b6d5c0a77bef3c589012352b3f0cf5520e674e9d01",
- "sha256:6dbbd7aabbc861eec6b910522534894d9dbb507d5819bc982032c3ea2e974f51",
- "sha256:6e737915de826650d1a5f7ff4ac6cf888a26f021a647390ca7bafdba0e85462b",
- "sha256:6ed9b2cfe85abc720e8943c1808eeffd41daa73e18b7c1e1a228b0b91f768ccc",
- "sha256:711ec617ba453fdfc66616db2520db3a6d9a891e3bf62ef9aba4c95bb4e61230",
- "sha256:844dacdf7530c5c612718cf12bc001f59b2d9329d35b495f1ff25045161aa6af",
- "sha256:86b52e146da13c896e50c5a3341a9448151f1092b1a4153e425d1e8b62fec508",
- "sha256:985c06c2a0f227131733ae58d6a541a5bc8b665e7305494782bebdb74202b793",
- "sha256:a86dfe45f4f9c55b1a2312ff20a59b30da8d39c0e8821d00018372a2a177098f",
- "sha256:aa3cd07f7f7e3183b63d48300666f920828a9dbd7d7ec53d450df2c4953687a9",
- "sha256:b1964ed645ef8317806d615d9ff006c0dadc09dfc54b99ae67f9ba7a1ec9d5d2",
- "sha256:b2abbff9e4141484bb89b96eb8eae186d77bc6d5ffbec6b01783ee5c3c467351",
- "sha256:cc33c3a90492e21713260095f02b12bee02b8d1f2c03a221d763ce04fa90e2e9",
- "sha256:d7de3bf0986d777807611c36e809b77a13bf1888f5c8db0ebf24b47a52d10726",
- "sha256:db5e3c52576cc5b93a959a03ccc3b02cb8f0af1fbbdc80645f7a215f0b864f3a",
- "sha256:e168aa795ffbb11379c942cf95bf813c7db9aa55538eb61de8c6815e092416f5",
- "sha256:e9ca911f8e2d3117e5241d5fa9aaa991cb22fb0792627eeada47425d706b5ec8",
- "sha256:eccf962d41ca46e6326b97c8fe0a6687b58dfc1a5f6540ed071ff1474cea749e",
- "sha256:efa19deae6b9e504a74347fe5e25c2cb9343766c489c2ae921b05f37338b18d1",
- "sha256:f4b0460a21f784abe17b496f66e74157a6c36116fa86da8bf6aa028b9e8ad5fe",
- "sha256:f93d508ca64d924d478fb11e272e09524698f0c581d9032e68958cfbdd41faef"
+ "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e",
+ "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237",
+ "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f",
+ "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300",
+ "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076",
+ "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221",
+ "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82",
+ "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92",
+ "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3",
+ "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6",
+ "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e",
+ "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae",
+ "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535",
+ "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f",
+ "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26",
+ "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7",
+ "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53",
+ "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b",
+ "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc",
+ "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee",
+ "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20",
+ "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d",
+ "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9",
+ "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244",
+ "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f",
+ "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b",
+ "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370",
+ "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9",
+ "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d",
+ "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0"
],
"index": "pypi",
- "version": "==2.7.5"
+ "version": "==2.7.6.1"
},
"pykafka": {
"hashes": [
@@ -193,11 +255,19 @@
},
"requests": {
"hashes": [
- "sha256:99dcfdaaeb17caf6e526f32b6a7b780461512ab3f1d992187801694cba42770c",
- "sha256:a84b8c9ab6239b578f22d1c21d51b696dcfe004032bb80ea832398d6909d7279"
+ "sha256:65b3a120e4329e33c9889db89c80976c5272f56ea92d3e74da8a463992e3ff54",
+ "sha256:ea881206e59f41dbd0bd445437d792e43906703fff75ca8ff43ccdb11f33f263"
],
"index": "pypi",
- "version": "==2.20.0"
+ "version": "==2.20.1"
+ },
+ "sickle": {
+ "hashes": [
+ "sha256:76d66ed4607af2cd36ee15568a98e7f147d4ec3dd227bd047664a1ca88b21944",
+ "sha256:b0aaa41d97a0c355aa6099b4dfa46c03f0bf828e6171960a15d68bd0548215ec"
+ ],
+ "index": "pypi",
+ "version": "==0.6.4"
},
"six": {
"hashes": [
@@ -208,10 +278,10 @@
},
"sqlalchemy": {
"hashes": [
- "sha256:84412de3794acee05630e7788f25e80e81f78eb4837e7b71d0499129f660486a"
+ "sha256:9de7c7dabcf06319becdb7e15099c44e5e34ba7062f9ba10bc00e562f5db3d04"
],
"index": "pypi",
- "version": "==1.2.13"
+ "version": "==1.2.14"
},
"tabulate": {
"hashes": [
@@ -279,49 +349,49 @@
},
"coverage": {
"hashes": [
- "sha256:03481e81d558d30d230bc12999e3edffe392d244349a90f4ef9b88425fac74ba",
- "sha256:0b136648de27201056c1869a6c0d4e23f464750fd9a9ba9750b8336a244429ed",
- "sha256:0bf8cbbd71adfff0ef1f3a1531e6402d13b7b01ac50a79c97ca15f030dba6306",
- "sha256:104ab3934abaf5be871a583541e8829d6c19ce7bde2923b2751e0d3ca44db60a",
- "sha256:10a46017fef60e16694a30627319f38a2b9b52e90182dddb6e37dcdab0f4bf95",
- "sha256:15b111b6a0f46ee1a485414a52a7ad1d703bdf984e9ed3c288a4414d3871dcbd",
- "sha256:198626739a79b09fa0a2f06e083ffd12eb55449b5f8bfdbeed1df4910b2ca640",
- "sha256:1c383d2ef13ade2acc636556fd544dba6e14fa30755f26812f54300e401f98f2",
- "sha256:23d341cdd4a0371820eb2b0bd6b88f5003a7438bbedb33688cd33b8eae59affd",
- "sha256:28b2191e7283f4f3568962e373b47ef7f0392993bb6660d079c62bd50fe9d162",
- "sha256:2a5b73210bad5279ddb558d9a2bfedc7f4bf6ad7f3c988641d83c40293deaec1",
- "sha256:2eb564bbf7816a9d68dd3369a510be3327f1c618d2357fa6b1216994c2e3d508",
- "sha256:337ded681dd2ef9ca04ef5d93cfc87e52e09db2594c296b4a0a3662cb1b41249",
- "sha256:3a2184c6d797a125dca8367878d3b9a178b6fdd05fdc2d35d758c3006a1cd694",
- "sha256:3c79a6f7b95751cdebcd9037e4d06f8d5a9b60e4ed0cd231342aa8ad7124882a",
- "sha256:3d72c20bd105022d29b14a7d628462ebdc61de2f303322c0212a054352f3b287",
- "sha256:3eb42bf89a6be7deb64116dd1cc4b08171734d721e7a7e57ad64cc4ef29ed2f1",
- "sha256:4635a184d0bbe537aa185a34193898eee409332a8ccb27eea36f262566585000",
- "sha256:56e448f051a201c5ebbaa86a5efd0ca90d327204d8b059ab25ad0f35fbfd79f1",
- "sha256:5a13ea7911ff5e1796b6d5e4fbbf6952381a611209b736d48e675c2756f3f74e",
- "sha256:69bf008a06b76619d3c3f3b1983f5145c75a305a0fea513aca094cae5c40a8f5",
- "sha256:6bc583dc18d5979dc0f6cec26a8603129de0304d5ae1f17e57a12834e7235062",
- "sha256:701cd6093d63e6b8ad7009d8a92425428bc4d6e7ab8d75efbb665c806c1d79ba",
- "sha256:7608a3dd5d73cb06c531b8925e0ef8d3de31fed2544a7de6c63960a1e73ea4bc",
- "sha256:76ecd006d1d8f739430ec50cc872889af1f9c1b6b8f48e29941814b09b0fd3cc",
- "sha256:7aa36d2b844a3e4a4b356708d79fd2c260281a7390d678a10b91ca595ddc9e99",
- "sha256:7d3f553904b0c5c016d1dad058a7554c7ac4c91a789fca496e7d8347ad040653",
- "sha256:7e1fe19bd6dce69d9fd159d8e4a80a8f52101380d5d3a4d374b6d3eae0e5de9c",
- "sha256:8c3cb8c35ec4d9506979b4cf90ee9918bc2e49f84189d9bf5c36c0c1119c6558",
- "sha256:9d6dd10d49e01571bf6e147d3b505141ffc093a06756c60b053a859cb2128b1f",
- "sha256:9e112fcbe0148a6fa4f0a02e8d58e94470fc6cb82a5481618fea901699bf34c4",
- "sha256:ac4fef68da01116a5c117eba4dd46f2e06847a497de5ed1d64bb99a5fda1ef91",
- "sha256:b8815995e050764c8610dbc82641807d196927c3dbed207f0a079833ffcf588d",
- "sha256:be6cfcd8053d13f5f5eeb284aa8a814220c3da1b0078fa859011c7fffd86dab9",
- "sha256:c1bb572fab8208c400adaf06a8133ac0712179a334c09224fb11393e920abcdd",
- "sha256:de4418dadaa1c01d497e539210cb6baa015965526ff5afc078c57ca69160108d",
- "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6",
- "sha256:e4d96c07229f58cb686120f168276e434660e4358cc9cf3b0464210b04913e77",
- "sha256:f05a636b4564104120111800021a92e43397bc12a5c72fed7036be8556e0029e",
- "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80",
- "sha256:f8a923a85cb099422ad5a2e345fe877bbc89a8a8b23235824a93488150e45f6e"
- ],
- "version": "==4.5.1"
+ "sha256:06123b58a1410873e22134ca2d88bd36680479fe354955b3579fb8ff150e4d27",
+ "sha256:09e47c529ff77bf042ecfe858fb55c3e3eb97aac2c87f0349ab5a7efd6b3939f",
+ "sha256:0a1f9b0eb3aa15c990c328535655847b3420231af299386cfe5efc98f9c250fe",
+ "sha256:0cc941b37b8c2ececfed341444a456912e740ecf515d560de58b9a76562d966d",
+ "sha256:0d34245f824cc3140150ab7848d08b7e2ba67ada959d77619c986f2062e1f0e8",
+ "sha256:10e8af18d1315de936d67775d3a814cc81d0747a1a0312d84e27ae5610e313b0",
+ "sha256:1b4276550b86caa60606bd3572b52769860a81a70754a54acc8ba789ce74d607",
+ "sha256:1e8a2627c48266c7b813975335cfdea58c706fe36f607c97d9392e61502dc79d",
+ "sha256:258b21c5cafb0c3768861a6df3ab0cfb4d8b495eee5ec660e16f928bf7385390",
+ "sha256:2b224052bfd801beb7478b03e8a66f3f25ea56ea488922e98903914ac9ac930b",
+ "sha256:3ad59c84c502cd134b0088ca9038d100e8fb5081bbd5ccca4863f3804d81f61d",
+ "sha256:447c450a093766744ab53bf1e7063ec82866f27bcb4f4c907da25ad293bba7e3",
+ "sha256:46101fc20c6f6568561cdd15a54018bb42980954b79aa46da8ae6f008066a30e",
+ "sha256:4710dc676bb4b779c4361b54eb308bc84d64a2fa3d78e5f7228921eccce5d815",
+ "sha256:510986f9a280cd05189b42eee2b69fecdf5bf9651d4cd315ea21d24a964a3c36",
+ "sha256:5535dda5739257effef56e49a1c51c71f1d37a6e5607bb25a5eee507c59580d1",
+ "sha256:5a7524042014642b39b1fcae85fb37556c200e64ec90824ae9ecf7b667ccfc14",
+ "sha256:5f55028169ef85e1fa8e4b8b1b91c0b3b0fa3297c4fb22990d46ff01d22c2d6c",
+ "sha256:6694d5573e7790a0e8d3d177d7a416ca5f5c150742ee703f3c18df76260de794",
+ "sha256:6831e1ac20ac52634da606b658b0b2712d26984999c9d93f0c6e59fe62ca741b",
+ "sha256:71afc1f5cd72ab97330126b566bbf4e8661aab7449f08895d21a5d08c6b051ff",
+ "sha256:7349c27128334f787ae63ab49d90bf6d47c7288c63a0a5dfaa319d4b4541dd2c",
+ "sha256:77f0d9fa5e10d03aa4528436e33423bfa3718b86c646615f04616294c935f840",
+ "sha256:828ad813c7cdc2e71dcf141912c685bfe4b548c0e6d9540db6418b807c345ddd",
+ "sha256:859714036274a75e6e57c7bab0c47a4602d2a8cfaaa33bbdb68c8359b2ed4f5c",
+ "sha256:85a06c61598b14b015d4df233d249cd5abfa61084ef5b9f64a48e997fd829a82",
+ "sha256:869ef4a19f6e4c6987e18b315721b8b971f7048e6eaea29c066854242b4e98d9",
+ "sha256:8cb4febad0f0b26c6f62e1628f2053954ad2c555d67660f28dfb1b0496711952",
+ "sha256:977e2d9a646773cc7428cdd9a34b069d6ee254fadfb4d09b3f430e95472f3cf3",
+ "sha256:99bd767c49c775b79fdcd2eabff405f1063d9d959039c0bdd720527a7738748a",
+ "sha256:a5c58664b23b248b16b96253880b2868fb34358911400a7ba39d7f6399935389",
+ "sha256:aaa0f296e503cda4bc07566f592cd7a28779d433f3a23c48082af425d6d5a78f",
+ "sha256:ab235d9fe64833f12d1334d29b558aacedfbca2356dfb9691f2d0d38a8a7bfb4",
+ "sha256:b3b0c8f660fae65eac74fbf003f3103769b90012ae7a460863010539bb7a80da",
+ "sha256:bab8e6d510d2ea0f1d14f12642e3f35cefa47a9b2e4c7cea1852b52bc9c49647",
+ "sha256:c45297bbdbc8bb79b02cf41417d63352b70bcb76f1bbb1ee7d47b3e89e42f95d",
+ "sha256:d19bca47c8a01b92640c614a9147b081a1974f69168ecd494687c827109e8f42",
+ "sha256:d64b4340a0c488a9e79b66ec9f9d77d02b99b772c8b8afd46c1294c1d39ca478",
+ "sha256:da969da069a82bbb5300b59161d8d7c8d423bc4ccd3b410a9b4d8932aeefc14b",
+ "sha256:ed02c7539705696ecb7dc9d476d861f3904a8d2b7e894bd418994920935d36bb",
+ "sha256:ee5b8abc35b549012e03a7b1e86c09491457dba6c94112a2482b18589cc2bdb9"
+ ],
+ "version": "==4.5.2"
},
"decorator": {
"hashes": [
@@ -470,39 +540,39 @@
},
"psycopg2": {
"hashes": [
- "sha256:0b9e48a1c1505699a64ac58815ca99104aacace8321e455072cee4f7fe7b2698",
- "sha256:0f4c784e1b5a320efb434c66a50b8dd7e30a7dc047e8f45c0a8d2694bfe72781",
- "sha256:0fdbaa32c9eb09ef09d425dc154628fca6fa69d2f7c1a33f889abb7e0efb3909",
- "sha256:11fbf688d5c953c0a5ba625cc42dea9aeb2321942c7c5ed9341a68f865dc8cb1",
- "sha256:19eaac4eb25ab078bd0f28304a0cb08702d120caadfe76bb1e6846ed1f68635e",
- "sha256:3232ec1a3bf4dba97fbf9b03ce12e4b6c1d01ea3c85773903a67ced725728232",
- "sha256:36f8f9c216fcca048006f6dd60e4d3e6f406afde26cfb99e063f137070139eaf",
- "sha256:59c1a0e4f9abe970062ed35d0720935197800a7ef7a62b3a9e3a70588d9ca40b",
- "sha256:6506c5ff88750948c28d41852c09c5d2a49f51f28c6d90cbf1b6808e18c64e88",
- "sha256:6bc3e68ee16f571681b8c0b6d5c0a77bef3c589012352b3f0cf5520e674e9d01",
- "sha256:6dbbd7aabbc861eec6b910522534894d9dbb507d5819bc982032c3ea2e974f51",
- "sha256:6e737915de826650d1a5f7ff4ac6cf888a26f021a647390ca7bafdba0e85462b",
- "sha256:6ed9b2cfe85abc720e8943c1808eeffd41daa73e18b7c1e1a228b0b91f768ccc",
- "sha256:711ec617ba453fdfc66616db2520db3a6d9a891e3bf62ef9aba4c95bb4e61230",
- "sha256:844dacdf7530c5c612718cf12bc001f59b2d9329d35b495f1ff25045161aa6af",
- "sha256:86b52e146da13c896e50c5a3341a9448151f1092b1a4153e425d1e8b62fec508",
- "sha256:985c06c2a0f227131733ae58d6a541a5bc8b665e7305494782bebdb74202b793",
- "sha256:a86dfe45f4f9c55b1a2312ff20a59b30da8d39c0e8821d00018372a2a177098f",
- "sha256:aa3cd07f7f7e3183b63d48300666f920828a9dbd7d7ec53d450df2c4953687a9",
- "sha256:b1964ed645ef8317806d615d9ff006c0dadc09dfc54b99ae67f9ba7a1ec9d5d2",
- "sha256:b2abbff9e4141484bb89b96eb8eae186d77bc6d5ffbec6b01783ee5c3c467351",
- "sha256:cc33c3a90492e21713260095f02b12bee02b8d1f2c03a221d763ce04fa90e2e9",
- "sha256:d7de3bf0986d777807611c36e809b77a13bf1888f5c8db0ebf24b47a52d10726",
- "sha256:db5e3c52576cc5b93a959a03ccc3b02cb8f0af1fbbdc80645f7a215f0b864f3a",
- "sha256:e168aa795ffbb11379c942cf95bf813c7db9aa55538eb61de8c6815e092416f5",
- "sha256:e9ca911f8e2d3117e5241d5fa9aaa991cb22fb0792627eeada47425d706b5ec8",
- "sha256:eccf962d41ca46e6326b97c8fe0a6687b58dfc1a5f6540ed071ff1474cea749e",
- "sha256:efa19deae6b9e504a74347fe5e25c2cb9343766c489c2ae921b05f37338b18d1",
- "sha256:f4b0460a21f784abe17b496f66e74157a6c36116fa86da8bf6aa028b9e8ad5fe",
- "sha256:f93d508ca64d924d478fb11e272e09524698f0c581d9032e68958cfbdd41faef"
+ "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e",
+ "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237",
+ "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f",
+ "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300",
+ "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076",
+ "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221",
+ "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82",
+ "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92",
+ "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3",
+ "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6",
+ "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e",
+ "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae",
+ "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535",
+ "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f",
+ "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26",
+ "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7",
+ "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53",
+ "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b",
+ "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc",
+ "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee",
+ "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20",
+ "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d",
+ "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9",
+ "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244",
+ "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f",
+ "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b",
+ "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370",
+ "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9",
+ "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d",
+ "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0"
],
"index": "pypi",
- "version": "==2.7.5"
+ "version": "==2.7.6.1"
},
"ptyprocess": {
"hashes": [
@@ -535,11 +605,11 @@
},
"pytest": {
"hashes": [
- "sha256:a9e5e8d7ab9d5b0747f37740276eb362e6a76275d76cebbb52c6049d93b475db",
- "sha256:bf47e8ed20d03764f963f0070ff1c8fda6e2671fc5dd562a4d3b7148ad60f5ca"
+ "sha256:488c842647bbeb350029da10325cb40af0a9c7a2fdda45aeb1dda75b60048ffb",
+ "sha256:c055690dfefa744992f563e8c3a654089a6aa5b8092dded9b6fafbd70b2e45a7"
],
"index": "pypi",
- "version": "==3.9.3"
+ "version": "==4.0.0"
},
"pytest-cov": {
"hashes": [
@@ -558,19 +628,19 @@
},
"requests": {
"hashes": [
- "sha256:99dcfdaaeb17caf6e526f32b6a7b780461512ab3f1d992187801694cba42770c",
- "sha256:a84b8c9ab6239b578f22d1c21d51b696dcfe004032bb80ea832398d6909d7279"
+ "sha256:65b3a120e4329e33c9889db89c80976c5272f56ea92d3e74da8a463992e3ff54",
+ "sha256:ea881206e59f41dbd0bd445437d792e43906703fff75ca8ff43ccdb11f33f263"
],
"index": "pypi",
- "version": "==2.20.0"
+ "version": "==2.20.1"
},
"responses": {
"hashes": [
- "sha256:682fafb124e799eeee67ec15c9678d955a88affda5613b09788ef80c03987cf0",
- "sha256:9b1c14871c66329f509711627e3de5779a2ae50bd532ac162297623424288756"
+ "sha256:16ad4a7a914f20792111157adf09c63a8dc37699c57d1ad20dbc281a4f5743fb",
+ "sha256:b9b31d9b1fcf6d48aea044c9fdd3d04199f6d227b0650c15d2566b0135bc1ed7"
],
"index": "pypi",
- "version": "==0.10.2"
+ "version": "==0.10.4"
},
"six": {
"hashes": [
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index f1bb3416..6ecc3ec6 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -3,11 +3,13 @@
import sys
import argparse
import datetime
-from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker
+from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker,\
+ HarvestArxivWorker, HarvestPubmedWorker, HarvestDoajArticleWorker,\
+ HarvestDoajJournalWorker
def run_crossref(args):
worker = HarvestCrossrefWorker(
- args.kafka_hosts,
+ kafka_hosts=args.kafka_hosts,
produce_topic="fatcat-{}.crossref".format(args.env),
state_topic="fatcat-{}.crossref-state".format(args.env),
contact_email=args.contact_email,
@@ -17,7 +19,7 @@ def run_crossref(args):
def run_datacite(args):
worker = HarvestDataciteWorker(
- args.kafka_hosts,
+ kafka_hosts=args.kafka_hosts,
produce_topic="fatcat-{}.datacite".format(args.env),
state_topic="fatcat-{}.datacite-state".format(args.env),
contact_email=args.contact_email,
@@ -25,6 +27,43 @@ def run_datacite(args):
end_date=args.end_date)
worker.run()
+def run_arxiv(args):
+    """Build the arXiv OAI-PMH harvest worker from parsed CLI args and run it."""
+    # Topic names are namespaced by deployment environment.
+    records_topic = "fatcat-{}.arxiv".format(args.env)
+    progress_topic = "fatcat-{}.arxiv-state".format(args.env)
+    harvester = HarvestArxivWorker(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=records_topic,
+        state_topic=progress_topic,
+        start_date=args.start_date,
+        end_date=args.end_date)
+    harvester.run()
+
+def run_pubmed(args):
+    """Build the Pubmed OAI-PMH harvest worker from parsed CLI args and run it."""
+    # Topic names are namespaced by deployment environment.
+    records_topic = "fatcat-{}.pubmed".format(args.env)
+    progress_topic = "fatcat-{}.pubmed-state".format(args.env)
+    harvester = HarvestPubmedWorker(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=records_topic,
+        state_topic=progress_topic,
+        start_date=args.start_date,
+        end_date=args.end_date)
+    harvester.run()
+
+def run_doaj_article(args):
+    """Build the DOAJ article OAI-PMH harvest worker from parsed CLI args and run it."""
+    # Topic names are namespaced by deployment environment.
+    records_topic = "fatcat-{}.doaj-article".format(args.env)
+    progress_topic = "fatcat-{}.doaj-article-state".format(args.env)
+    harvester = HarvestDoajArticleWorker(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=records_topic,
+        state_topic=progress_topic,
+        start_date=args.start_date,
+        end_date=args.end_date)
+    harvester.run()
+
+def run_doaj_journal(args):
+    """Build the DOAJ journal OAI-PMH harvest worker from parsed CLI args and run it."""
+    # Topic names are namespaced by deployment environment.
+    records_topic = "fatcat-{}.doaj-journal".format(args.env)
+    progress_topic = "fatcat-{}.doaj-journal-state".format(args.env)
+    harvester = HarvestDoajJournalWorker(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=records_topic,
+        state_topic=progress_topic,
+        start_date=args.start_date,
+        end_date=args.end_date)
+    harvester.run()
+
+
def mkdate(raw):
return datetime.datetime.strptime(raw, "%Y-%m-%d").date()
@@ -59,6 +98,18 @@ def main():
sub_datacite = subparsers.add_parser('datacite')
sub_datacite.set_defaults(func=run_datacite)
+ sub_arxiv = subparsers.add_parser('arxiv')
+ sub_arxiv.set_defaults(func=run_arxiv)
+
+ sub_pubmed = subparsers.add_parser('pubmed')
+ sub_pubmed.set_defaults(func=run_pubmed)
+
+ # DOAJ stuff disabled because API range-requests are broken
+ #sub_doaj_article = subparsers.add_parser('doaj-article')
+ #sub_doaj_article.set_defaults(func=run_doaj_article)
+ #sub_doaj_journal = subparsers.add_parser('doaj-journal')
+ #sub_doaj_journal.set_defaults(func=run_doaj_journal)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do!")
diff --git a/python/fatcat_tools/harvest/__init__.py b/python/fatcat_tools/harvest/__init__.py
index 4de2cbde..7d814696 100644
--- a/python/fatcat_tools/harvest/__init__.py
+++ b/python/fatcat_tools/harvest/__init__.py
@@ -1,3 +1,5 @@
from .harvest_common import HarvestState
from .doi_registrars import HarvestCrossrefWorker, HarvestDataciteWorker
+from .oaipmh import HarvestArxivWorker, HarvestPubmedWorker,\
+ HarvestDoajArticleWorker, HarvestDoajJournalWorker
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index d5e4b7ec..10492c17 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -10,15 +10,13 @@ import datetime
from pykafka import KafkaClient
from fatcat_tools.workers import most_recent_message
-from .harvest_common import HarvestState
+from .harvest_common import HarvestState, DATE_FMT
# Skip pylint due to:
# AttributeError: 'NoneType' object has no attribute 'scope'
# in 'astroid/node_classes.py'
# pylint: skip-file
-DATE_FMT = "%Y-%m-%d"
-
class HarvestCrossrefWorker:
"""
@@ -68,7 +66,6 @@ class HarvestCrossrefWorker:
self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks
self.api_batch_size = 50
- # for crossref, it's "from-index-date"
self.name = "Crossref"
def params(self, date_str):
@@ -86,6 +83,9 @@ class HarvestCrossrefWorker:
params['cursor'] = resp['message']['next-cursor']
return params
+    def extract_key(self, obj):
+        """Return the Kafka partition key for a work: its 'DOI' value, UTF-8 encoded."""
+        doi = obj['DOI']
+        return doi.encode('utf-8')
+
def fetch_date(self, date):
produce_topic = self.kafka.topics[self.produce_topic]
@@ -112,7 +112,7 @@ class HarvestCrossrefWorker:
self.extract_total(resp), http_resp.elapsed))
#print(json.dumps(resp))
for work in items:
- producer.produce(json.dumps(work).encode('utf-8'))
+ producer.produce(json.dumps(work).encode('utf-8'), partition_key=self.extract_key(work))
if len(items) < self.api_batch_size:
break
params = self.update_params(params, resp)
@@ -181,6 +181,9 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
def extract_total(self, resp):
return resp['meta']['total']
+    def extract_key(self, obj):
+        """Return the Kafka partition key for a record: its 'doi' value, UTF-8 encoded."""
+        doi = obj['doi']
+        return doi.encode('utf-8')
+
def update_params(self, params, resp):
params['page[number]'] = resp['meta']['page'] + 1
return params
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
new file mode 100644
index 00000000..c3cb90db
--- /dev/null
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -0,0 +1,157 @@
+
+"""
+OAI-PMH protocol:
+ https://sickle.readthedocs.io/en/latest/
+
+Pubmed
+ https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
+ https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm
+ https://github.com/titipata/pubmed_parser
+
+arxiv
+ some APIs work on a per-version basis, others do not
+
+ http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv
+ http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw
+
+doaj
+ https://github.com/miku/doajfetch
+
+-----
+
+actually, just going to re-use https://github.com/miku/metha for OAI-PMH stuff
+ => shell script from cronjob
+ => call metha-sync daily
+ => metha-cat -since <whenever> | kafkacat output
+ => echo "date" | kafkacat state
+ => some shell trick (comm?) to find missing dates; for each, do metha-cat into kafka
+
+or, just skip kafka for this stuff for now? hrm.
+
+crossref-like stuff is far enough along to keep
+
+## More Miku Magic!
+
+wowa, JSTOR KBART files!
+ http://www.jstor.org/kbart/collections/all-archive-titles
+
+https://github.com/miku/ldjtab: faster than jq for just grabbing
+
+sort can be told how much memory to use; eg: `sort -S50%`, and threads to use
+
+"""
+
+import re
+import sys
+import csv
+import json
+import time
+import requests
+import itertools
+import datetime
+from pykafka import KafkaClient
+import sickle
+
+from fatcat_tools.workers import most_recent_message
+from .harvest_common import HarvestState, DATE_FMT
+
+
+class HarvestOaiPmhWorker:
+    """
+    Base class for OAI-PMH harvesters.
+
+    For each date handed out by HarvestState, lists the records the endpoint
+    returns for that single day and publishes each record's raw XML to the
+    produce topic, keyed by the record's OAI identifier. Completed dates are
+    reported back to HarvestState together with the state topic.
+
+    Subclasses call this constructor and then set `endpoint_url` and
+    `metadata_prefix`; they may also override `name`, which is only used in
+    log output.
+
+    Based on Crossref importer
+    """
+
+    def __init__(self, kafka_hosts, produce_topic, state_topic,
+            start_date=None, end_date=None):
+        """
+        kafka_hosts: broker host string handed to pykafka's KafkaClient
+        produce_topic: name of the topic harvested records are written to
+        state_topic: name of the topic HarvestState is loaded from/updated via
+        start_date, end_date: optional bounds passed through to HarvestState
+        """
+
+        self.produce_topic = produce_topic
+        self.state_topic = state_topic
+        self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")
+
+        self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks
+
+        self.endpoint_url = None # needs override
+        self.metadata_prefix = None # needs override
+        # run() prints self.name when it finishes; default to the class name
+        # so subclasses that never assign it do not hit an AttributeError.
+        self.name = type(self).__name__
+        self.state = HarvestState(start_date, end_date)
+        self.state.initialize_from_kafka(self.kafka.topics[self.state_topic])
+
+
+    def fetch_date(self, date):
+        """
+        Fetch every record the endpoint lists for `date` and publish each one
+        to the produce topic.
+
+        date: object with strftime(); formatted with DATE_FMT and used as both
+            the 'from' and 'until' bound of the ListRecords request
+        """
+
+        api = sickle.Sickle(self.endpoint_url)
+        date_str = date.strftime(DATE_FMT)
+        produce_topic = self.kafka.topics[self.produce_topic]
+        # this dict kwargs hack is to work around 'from' as a reserved python keyword
+        # recommended by sickle docs
+        records = api.ListRecords(**{
+            'metadataPrefix': self.metadata_prefix,
+            'from': date_str,
+            'until': date_str,
+        })
+
+        with produce_topic.get_producer() as producer:
+            for count, item in enumerate(records, start=1):
+                # progress marker every 50 records
+                if count % 50 == 0:
+                    print("... up to {}".format(count))
+                producer.produce(
+                    item.raw.encode('utf-8'),
+                    partition_key=item.header.identifier.encode('utf-8'))
+
+    def run(self, continuous=False):
+        """
+        Harvest every date HarvestState hands out, marking each one complete
+        after it has been fetched.
+
+        continuous: passed to HarvestState.next(); when true, sleep
+            `loop_sleep` seconds and poll again whenever no date is pending
+            instead of returning
+        """
+
+        while True:
+            current = self.state.next(continuous)
+            if current:
+                print("Fetching records updated on {} (UTC)".format(current))
+                self.fetch_date(current)
+                self.state.complete(current, kafka_topic=self.kafka.topics[self.state_topic])
+                continue
+
+            if continuous:
+                print("Sleeping {} seconds...".format(self.loop_sleep))
+                # loop_sleep is a number of seconds, not a callable
+                time.sleep(self.loop_sleep)
+            else:
+                break
+        print("{} OAI-PMH ingest caught up".format(self.name))
+
+
+class HarvestArxivWorker(HarvestOaiPmhWorker):
+    """
+    OAI-PMH harvester for the arXiv export endpoint, using the "arXiv"
+    metadata format.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://export.arxiv.org/oai2"
+        self.metadata_prefix = "arXiv"
+        # run() formats self.name into its final status line, so make sure
+        # it is set (mirrors HarvestCrossrefWorker's self.name).
+        self.name = "arXiv"
+
+
+class HarvestPubmedWorker(HarvestOaiPmhWorker):
+    """
+    OAI-PMH harvester for the NCBI PMC endpoint, using the "pmc_fm"
+    metadata format.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
+        self.metadata_prefix = "pmc_fm"
+        # run() formats self.name into its final status line, so make sure
+        # it is set (mirrors HarvestCrossrefWorker's self.name).
+        self.name = "Pubmed"
+
+
+class HarvestDoajJournalWorker(HarvestOaiPmhWorker):
+    """
+    OAI-PMH harvester for DOAJ journal records ("oai_dc" metadata format).
+
+    WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://www.doaj.org/oai"
+        self.metadata_prefix = "oai_dc"
+        # run() formats self.name into its final status line, so make sure
+        # it is set (mirrors HarvestCrossrefWorker's self.name).
+        self.name = "DOAJ journal"
+
+
+class HarvestDoajArticleWorker(HarvestOaiPmhWorker):
+    """
+    OAI-PMH harvester for DOAJ article records ("oai_doaj" metadata format).
+
+    WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.endpoint_url = "https://www.doaj.org/oai.article"
+        self.metadata_prefix = "oai_doaj"
+        # run() formats self.name into its final status line, so make sure
+        # it is set (mirrors HarvestCrossrefWorker's self.name).
+        self.name = "DOAJ article"
+