Diffstat (limited to 'python')
41 files changed, 1966 insertions, 742 deletions
diff --git a/python/Pipfile b/python/Pipfile index eebdab36..b04bb91a 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -32,6 +32,7 @@ python-dateutil = "*" sickle = "*" python-snappy = "*" pymacaroons = "*" +ftfy= "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 296079f0..f2d39a99 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "c3deb49cf4c122c2aed3f4f944e9763cfcf40c85891ca3d3e9cabc3debbb9075" + "sha256": "8f98bb3f6a3083c8b03cb68d1ee48b25449a950dd8a9d15189f2eb4fae48f760" }, "pipfile-spec": 6, "requires": { @@ -96,27 +96,27 @@ }, "cryptography": { "hashes": [ - "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd", - "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038", - "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378", - "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3", - "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13", - "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc", - "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f", - "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427", - "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515", - "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b", - "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf", - "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681", - "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b", - "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5", - "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b", - "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6", - "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70", - "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e", - "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859" - ], - "version": "==2.4.2" + "sha256:05b3ded5e88747d28ee3ef493f2b92cbb947c1e45cf98cfef22e6d38bb67d4af", + "sha256:06826e7f72d1770e186e9c90e76b4f84d90cdb917b47ff88d8dc59a7b10e2b1e", + "sha256:08b753df3672b7066e74376f42ce8fc4683e4fd1358d34c80f502e939ee944d2", + "sha256:2cd29bd1911782baaee890544c653bb03ec7d95ebeb144d714b0f5c33deb55c7", + "sha256:31e5637e9036d966824edaa91bf0aa39dc6f525a1c599f39fd5c50340264e079", + "sha256:42fad67d7072216a49e34f923d8cbda9edacbf6633b19a79655e88a1b4857063", + "sha256:4946b67235b9d2ea7d31307be9d5ad5959d6c4a8f98f900157b47abddf698401", + "sha256:522fdb2809603ee97a4d0ef2f8d617bc791eb483313ba307cb9c0a773e5e5695", + "sha256:6f841c7272645dd7c65b07b7108adfa8af0aaea57f27b7f59e01d41f75444c85", + "sha256:7d335e35306af5b9bc0560ca39f740dfc8def72749645e193dd35be11fb323b3", + "sha256:8504661ffe324837f5c4607347eeee4cf0fcad689163c6e9c8d3b18cf1f4a4ad", + "sha256:9260b201ce584d7825d900c88700aa0bd6b40d4ebac7b213857bd2babee9dbca", + "sha256:9a30384cc402eac099210ab9b8801b2ae21e591831253883decdb4513b77a3cd", + "sha256:9e29af877c29338f0cab5f049ccc8bd3ead289a557f144376c4fbc7d1b98914f", + "sha256:ab50da871bc109b2d9389259aac269dd1b7c7413ee02d06fe4e486ed26882159", + "sha256:b13c80b877e73bcb6f012813c6f4a9334fcf4b0e96681c5a15dac578f2eedfa0", + "sha256:bfe66b577a7118e05b04141f0f1ed0959552d45672aa7ecb3d91e319d846001e", + 
"sha256:e091bd424567efa4b9d94287a952597c05d22155a13716bf5f9f746b9dc906d3", + "sha256:fa2b38c8519c5a3aa6e2b4e1cf1a549b54acda6adb25397ff542068e73d1ed00" + ], + "version": "==2.5" }, "fatcat-client": { "editable": true, @@ -152,6 +152,14 @@ "index": "pypi", "version": "==0.2" }, + "ftfy": { + "hashes": [ + "sha256:84a1614190173bb447ac9d581e50185c6aa35b538754b6bedaba0cc0f83d8e80", + "sha256:fa74757fb7cb444366fa6a79c2feabd40281a44dfbf6eaed492a804764ee26b2" + ], + "index": "pypi", + "version": "==5.5.1" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -366,6 +374,13 @@ ], "version": "==1.24.1" }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, "werkzeug": { "hashes": [ "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", @@ -558,10 +573,10 @@ }, "parso": { "hashes": [ - "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", - "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d", + "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31" ], - "version": "==0.3.1" + "version": "==0.3.2" }, "pathlib2": { "hashes": [ @@ -595,10 +610,10 @@ }, "pluggy": { "hashes": [ - "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095", - "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f" + "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", + "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a" ], - "version": "==0.8.0" + "version": "==0.8.1" }, "prompt-toolkit": { "hashes": [ @@ -610,38 +625,38 @@ }, "psycopg2": { "hashes": [ - "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e", - "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237", - "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f", - "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300", - "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076", - "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221", - "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82", - "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92", - "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3", - "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6", - "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e", - "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae", - "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535", - "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f", - "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26", - "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7", - "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53", - "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b", - "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc", - "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee", - "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20", - 
"sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d", - "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9", - "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244", - "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f", - "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b", - "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370", - "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9", - "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d", - "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0" - ], - "version": "==2.7.6.1" + "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94", + "sha256:0e9873e60f98f0c52339abf8f0339d1e22bfe5aae0bcf7aabd40c055175035ec", + "sha256:1148a5eb29073280bf9057c7fc45468592c1bb75a28f6df1591adb93c8cb63d0", + "sha256:259a8324e109d4922b0fcd046e223e289830e2568d6f4132a3702439e5fd532b", + "sha256:28dffa9ed4595429e61bacac41d3f9671bb613d1442ff43bcbec63d4f73ed5e8", + "sha256:314a74302d4737a3865d40ea50e430ce1543c921ba10f39d562e807cfe2edf2a", + "sha256:36b60201b6d215d7658a71493fdf6bd5e60ad9a0cffed39906627ff9f4f3afd3", + "sha256:3f9d532bce54c4234161176ff3b8688ff337575ca441ea27597e112dfcd0ee0c", + "sha256:5d222983847b40af989ad96c07fc3f07e47925e463baa5de716be8f805b41d9b", + "sha256:6757a6d2fc58f7d8f5d471ad180a0bd7b4dd3c7d681f051504fbea7ae29c8d6f", + "sha256:6a0e0f1e74edb0ab57d89680e59e7bfefad2bfbdf7c80eb38304d897d43674bb", + "sha256:6ca703ccdf734e886a1cf53eb702261110f6a8b0ed74bcad15f1399f74d3f189", + "sha256:8513b953d8f443c446aa79a4cc8a898bd415fc5e29349054f03a7d696d495542", + "sha256:9262a5ce2038570cb81b4d6413720484cb1bc52c064b2f36228d735b1f98b794", + "sha256:97441f851d862a0c844d981cbee7ee62566c322ebb3d68f86d66aa99d483985b", + "sha256:a07feade155eb8e69b54dd6774cf6acf2d936660c61d8123b8b6b1f9247b67d6", + "sha256:a9b9c02c91b1e3ec1f1886b2d0a90a0ea07cc529cb7e6e472b556bc20ce658f3", + "sha256:ae88216f94728d691b945983140bf40d51a1ff6c7fe57def93949bf9339ed54a", + "sha256:b360ffd17659491f1a6ad7c928350e229c7b7bd83a2b922b6ee541245c7a776f", + "sha256:b4221957ceccf14b2abdabef42d806e791350be10e21b260d7c9ce49012cc19e", + "sha256:b90758e49d5e6b152a460d10b92f8a6ccf318fcc0ee814dcf53f3a6fc5328789", + "sha256:c669ea986190ed05fb289d0c100cc88064351f2b85177cbfd3564c4f4847d18c", + "sha256:d1b61999d15c79cf7f4f7cc9021477aef35277fc52452cf50fd13b713c84424d", + "sha256:de7bb043d1adaaf46e38d47e7a5f703bb3dab01376111e522b07d25e1a79c1e1", + "sha256:e393568e288d884b94d263f2669215197840d097c7e5b0acd1a51c1ea7d1aba8", + "sha256:ed7e0849337bd37d89f2c2b0216a0de863399ee5d363d31b1e5330a99044737b", + "sha256:f153f71c3164665d269a5d03c7fa76ba675c7a8de9dc09a4e2c2cdc9936a7b41", + "sha256:f1fb5a8427af099beb7f65093cbdb52e021b8e6dbdfaf020402a623f4181baf5", + "sha256:f36b333e9f86a2fba960c72b90c34be6ca71819e300f7b1fc3d2b0f0b2c546cd", + "sha256:f4526d078aedd5187d0508aa5f9a01eae6a48a470ed678406da94b4cd6524b7e" + ], + "version": "==2.7.7" }, "ptyprocess": { "hashes": [ @@ -674,11 +689,11 @@ }, "pytest": { "hashes": [ - "sha256:3e65a22eb0d4f1bdbc1eacccf4a3198bf8d4049dea5112d70a0c61b00e748d02", - "sha256:5924060b374f62608a078494b909d341720a050b5224ff87e17e12377486a71d" + "sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2", + "sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be" ], "index": "pypi", - "version": "==4.1.0" + "version": "==4.1.1" }, "pytest-cov": { "hashes": [ @@ -727,30 
+742,30 @@ }, "typed-ast": { "hashes": [ - "sha256:0555eca1671ebe09eb5f2176723826f6f44cca5060502fea259de9b0e893ab53", - "sha256:0ca96128ea66163aea13911c9b4b661cb345eb729a20be15c034271360fc7474", - "sha256:16ccd06d614cf81b96de42a37679af12526ea25a208bce3da2d9226f44563868", - "sha256:1e21ae7b49a3f744958ffad1737dfbdb43e1137503ccc59f4e32c4ac33b0bd1c", - "sha256:37670c6fd857b5eb68aa5d193e14098354783b5138de482afa401cc2644f5a7f", - "sha256:46d84c8e3806619ece595aaf4f37743083f9454c9ea68a517f1daa05126daf1d", - "sha256:5b972bbb3819ece283a67358103cc6671da3646397b06e7acea558444daf54b2", - "sha256:6306ffa64922a7b58ee2e8d6f207813460ca5a90213b4a400c2e730375049246", - "sha256:6cb25dc95078931ecbd6cbcc4178d1b8ae8f2b513ae9c3bd0b7f81c2191db4c6", - "sha256:7e19d439fee23620dea6468d85bfe529b873dace39b7e5b0c82c7099681f8a22", - "sha256:7f5cd83af6b3ca9757e1127d852f497d11c7b09b4716c355acfbebf783d028da", - "sha256:81e885a713e06faeef37223a5b1167615db87f947ecc73f815b9d1bbd6b585be", - "sha256:94af325c9fe354019a29f9016277c547ad5d8a2d98a02806f27a7436b2da6735", - "sha256:b1e5445c6075f509d5764b84ce641a1535748801253b97f3b7ea9d948a22853a", - "sha256:cb061a959fec9a514d243831c514b51ccb940b58a5ce572a4e209810f2507dcf", - "sha256:cc8d0b703d573cbabe0d51c9d68ab68df42a81409e4ed6af45a04a95484b96a5", - "sha256:da0afa955865920edb146926455ec49da20965389982f91e926389666f5cf86a", - "sha256:dc76738331d61818ce0b90647aedde17bbba3d3f9e969d83c1d9087b4f978862", - "sha256:e7ec9a1445d27dbd0446568035f7106fa899a36f55e52ade28020f7b3845180d", - "sha256:f741ba03feb480061ab91a465d1a3ed2d40b52822ada5b4017770dfcb88f839f", - "sha256:fe800a58547dd424cd286b7270b967b5b3316b993d86453ede184a17b5a6b17d" + "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe", + "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c", + "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2", + "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a", + "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7", + "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827", + "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33", + "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9", + "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032", + "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9", + "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2", + "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2", + "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062", + "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15", + "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357", + "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a", + "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824", + "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442", + "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1", + "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2", + "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6" ], "markers": "python_version < '3.7' and implementation_name == 'cpython'", - "version": "==1.1.1" + "version": "==1.2.0" }, "urllib3": { "hashes": [ @@ -768,9 +783,9 @@ }, "wrapt": { "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" + 
"sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533" ], - "version": "==1.10.11" + "version": "==1.11.1" } } } diff --git a/python/README_import.md b/python/README_import.md index cc9a94e1..2465940b 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -26,11 +26,13 @@ the others: wget https://archive.org/download/ia_papers_manifest_2018-01-25/index/idents_files_urls.sqlite.gz wget https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_extra_metadata.csv wget https://archive.org/download/issn_issnl_mappings/20180216.ISSN-to-ISSN-L.txt - wget https://archive.org/download/orcid-dump-2017/public_profiles_API-2.0_2017_10_json.tar.gz + wget https://archive.org/download/orcid-dump-2017/public_profiles_1_2_json.all.json.gz wget https://archive.org/download/ia_journal_pid_map_munge_20180908/release_ids.ia_munge_20180908.sqlite3.gz wget https://archive.org/download/ia_test_paper_matches/2018-08-27-2352.17-matchcrossref.insertable.json.gz wget https://archive.org/download/ia_papers_manifest_2018-01-25_matched/ia_papers_manifest_2018-01-25.matched.json.gz + gunzip public_profiles_1_2_json.all.json.gz + ## ISSN From CSV file: @@ -54,13 +56,14 @@ Usually 24 hours or so on fast production machine. ## Matched -Unknown speed! +These each take 2-4 hours: # No file update for the first import... - zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - + time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - # ... but do on the second zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - # GROBID extracted (release+file) time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata - + diff --git a/python/env.example b/python/env.example index c986b9d2..75fc5238 100644 --- a/python/env.example +++ b/python/env.example @@ -1,4 +1,5 @@ -FLASK_SECRET_KEY="" +FLASK_SECRET_KEY="TODO-REPLACE-ME" +FATCAT_DOMAIN="dev.fatcat.wiki" # This key used in tests FATCAT_API_AUTH_TOKEN="AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug=" FATCAT_API_HOST="http://localhost:9411/v0" @@ -14,6 +15,5 @@ SENTRY_DSN="" # FATCAT_API_AUTH_TOKEN FATCAT_AUTH_WORKER_CROSSREF="" FATCAT_AUTH_WORKER_ORCID="" -FATCAT_AUTH_WORKER_ISSN="" -FATCAT_AUTH_WORKER_MATCHED="" -FATCAT_AUTH_WORKER_GROBID_METADATA="" +FATCAT_AUTH_WORKER_PUBMED="" +FATCAT_AUTH_WORKER_DATACITE="" diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 0e176b2c..a47aa175 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -1,47 +1,40 @@ #!/usr/bin/env python3 -""" -""" - import os, sys, argparse from fatcat_tools import authenticated_api -from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \ - IssnImporter, MatchedImporter, GrobidMetadataImporter, make_kafka_consumer +from fatcat_tools.importers import * def run_crossref(args): - fci = CrossrefImporter(args.api, args.issn_map_file, + fci = CrossrefImporter(args.api, + args.issn_map_file, 
extid_map_file=args.extid_map_file, - create_containers=(not args.no_create_containers), - check_existing=(not args.no_release_updates)) + edit_batch_size=args.batch_size, + bezerk_mode=args.bezerk_mode) if args.kafka_mode: - consumer = make_kafka_consumer( - args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import") - fci.process_batch(consumer, size=args.batch_size, decode_kafka=True) + KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import").run() else: - fci.process_batch(args.json_file, size=args.batch_size) - fci.describe_run() + JsonLinePusher(fci).run() def run_orcid(args): - foi = OrcidImporter(args.api) - foi.process_batch(args.json_file, size=args.batch_size) - foi.describe_run() + foi = OrcidImporter(args.api, + edit_batch_size=args.batch_size) + JsonLinePusher(foi, args.json_file).run() -def run_issn(args): - fii = IssnImporter(args.api) - fii.process_csv_batch(args.csv_file, size=args.batch_size) - fii.describe_run() +def run_journal_metadata(args): + fii = JournalMetadataImporter(args.api, + edit_batch_size=args.batch_size) + CsvLinePusher(fii, args.csv_file).run() def run_matched(args): fmi = MatchedImporter(args.api, - skip_file_updates=args.no_file_updates) - fmi.process_batch(args.json_file, size=args.batch_size) - fmi.describe_run() + bezerk_mode=args.bezerk_mode, + edit_batch_size=args.batch_size) + JsonLinePusher(fmi, args.json_file).run() def run_grobid_metadata(args): - fmi = GrobidMetadataImporter(args.api) - fmi.process_source(args.tsv_file, group_size=args.group_size) - fmi.describe_run() + fmi = GrobidMetadataImporter(args.api, edit_batch_size=args.batch_size, longtail_oa=args.longtail_oa) + LinePusher(fmi, args.tsv_file).run() def main(): parser = argparse.ArgumentParser() @@ -73,18 +66,15 @@ def main(): sub_crossref.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str) - sub_crossref.add_argument('--no-create-containers', - action='store_true', - help="skip creation of new container entities based on ISSN") sub_crossref.add_argument('--batch-size', help="size of batch to send", default=50, type=int) sub_crossref.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)") - sub_crossref.add_argument('--no-release-updates', + sub_crossref.add_argument('--bezerk-mode', action='store_true', - help="don't lookup existing DOIs, just insert (only for bootstrap)") + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") sub_orcid = subparsers.add_parser('orcid') sub_orcid.set_defaults( @@ -98,37 +88,37 @@ def main(): help="size of batch to send", default=50, type=int) - sub_issn = subparsers.add_parser('issn') - sub_issn.set_defaults( - func=run_issn, - auth_var="FATCAT_AUTH_WORKER_ISSN", + sub_journal_metadata = subparsers.add_parser('journal-metadata') + sub_journal_metadata.set_defaults( + func=run_journal_metadata, + auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_issn.add_argument('csv_file', + sub_journal_metadata.add_argument('csv_file', help="Journal ISSN CSV metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_issn.add_argument('--batch-size', + sub_journal_metadata.add_argument('--batch-size', help="size of batch to send", default=50, type=int) sub_matched = subparsers.add_parser('matched') sub_matched.set_defaults( func=run_matched, - auth_var="FATCAT_AUTH_WORKER_MATCHED", + auth_var="FATCAT_API_AUTH_TOKEN", ) sub_matched.add_argument('json_file', 
help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_matched.add_argument('--no-file-updates', - action='store_true', - help="don't lookup existing files, just insert (only for bootstrap)") sub_matched.add_argument('--batch-size', help="size of batch to send", default=50, type=int) + sub_matched.add_argument('--bezerk-mode', + action='store_true', + help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") sub_grobid_metadata = subparsers.add_parser('grobid-metadata') sub_grobid_metadata.set_defaults( func=run_grobid_metadata, - auth_var="FATCAT_AUTH_WORKER_GROBID_METADATA", + auth_var="FATCAT_API_AUTH_TOKEN", ) sub_grobid_metadata.add_argument('tsv_file', help="TSV file to import from (or stdin)", @@ -136,6 +126,9 @@ def main(): sub_grobid_metadata.add_argument('--group-size', help="editgroup group size to use", default=75, type=int) + sub_matched.add_argument('--longtail-oa', + action='store_true', + help="if this is an import of longtail OA content (sets an 'extra' flag)") args = parser.parse_args() if not args.__dict__.get("func"): @@ -144,6 +137,7 @@ def main(): args.api = authenticated_api( args.host_url, + # token is an optional kwarg (can be empty string, None, etc) token=os.environ.get(args.auth_var)) args.func(args) diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index e6f081e5..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -1,7 +1,22 @@ -from .common import FatcatImporter, make_kafka_consumer +""" +To run an import you combine two classes; one each of: + +- RecordSource: somehow iterates over a source of raw records (eg, from a + database, Kafka, files on disk, stdin) and pushes into an entity importer. +- EntityImporter: class that a record iterator pushes raw (unparsed) records + into. The entity importer parses and decides what to do (ignore, update, + insert, etc). There is usually a primary entity type, though related entities + can be created along the way. Maintains API connection and editgroup/batch + state. + +""" + +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .grobid_metadata import GrobidMetadataImporter -from .issn import IssnImporter +from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +#from .kafka_source import KafkaSource +#from .file_source import FileSource diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 06897bee..89203a4f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re import sys import csv import json +import ftfy import itertools import subprocess from collections import Counter @@ -12,30 +13,66 @@ import fatcat_client from fatcat_client.rest import ApiException -# from: https://docs.python.org/3/library/itertools.html -def grouper(iterable, n, fillvalue=None): - "Collect data into fixed-length chunks or blocks" - args = [iter(iterable)] * n - return itertools.zip_longest(*args, fillvalue=fillvalue) +def clean(thing, force_xml=False): + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. 
-def make_kafka_consumer(hosts, env, topic_suffix, group):
-    topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
-    client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
-    consume_topic = client.topics[topic_name]
-    print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+    It will try to clean up common unicode mangles, HTML characters, etc.
-    consumer = consume_topic.get_balanced_consumer(
-        consumer_group=group.encode('utf-8'),
-        managed=True,
-        auto_commit_enable=True,
-        auto_commit_interval_ms=30000, # 30 seconds
-        compacted_topic=True,
-    )
-    return consumer
+    This will detect XML/HTML and "do the right thing" (aka, not remove
+    entities like '&amp;' if there are tags in the string), unless you pass the
+    'force_xml' parameter, which might be appropriate for, eg, names and
+    titles, which generally should be projected down to plain text.
+
+    Also strips extra whitespace.
+    """
+    if not thing:
+        return thing
+    fix_entities = 'auto'
+    if force_xml:
+        fix_entities = True
+    fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+    if not fixed:
+        # wasn't zero-length before, but is now; return None
+        return None
+    return fixed
+
+def test_clean():
-class FatcatImporter:
+    assert clean(None) == None
+    assert clean('') == ''
+    assert clean('123') == '123'
+    assert clean('a&amp;b') == 'a&b'
+    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+class EntityImporter:
     """
-    Base class for fatcat importers
+    Base class for fatcat entity importers.
+
+    The API exposed to the record iterator is:
+
+        push_record(raw_record)
+        finish()
+
+    The API that implementations are expected to fill in is:
+
+        want(raw_record) -> boolean
+        parse(raw_record) -> entity
+        try_update(entity) -> boolean
+        insert_batch([entity]) -> None
+
+    This class exposes helpers for implementations:
+
+        self.api
+        self.create_<entity>(entity) -> EntityEdit
+            for related entity types
+        self.push_entity(entity)
+        self.counts['exists'] += 1
+            if didn't update or insert because of an existing entity
+        self.counts['update'] += 1
+            if updated an entity
     """
     def __init__(self, api, **kwargs):
@@ -43,87 +80,135 @@ class FatcatImporter:
         eg_extra = kwargs.get('editgroup_extra', dict())
         eg_extra['git_rev'] = eg_extra.get('git_rev',
             subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter')
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
         self.api = api
-        self._editgroup_description = kwargs.get('editgroup_description')
-        self._editgroup_extra = kwargs.get('editgroup_extra')
-        issn_map_file = kwargs.get('issn_map_file')
+        self.bezerk_mode = kwargs.get('bezerk_mode', False)
+        self.edit_batch_size = kwargs.get('edit_batch_size', 100)
+        self.editgroup_description = kwargs.get('editgroup_description')
+        self.editgroup_extra = kwargs.get('editgroup_extra')
+        self.reset()
         self._issnl_id_map = dict()
         self._orcid_id_map = dict()
-        self._doi_id_map = dict()
-        if issn_map_file:
-            self.read_issn_map_file(issn_map_file)
         self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
-        self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0})
+        self._doi_id_map = dict()
-    def _editgroup(self):
-        eg = fatcat_client.Editgroup(
-            description=self._editgroup_description,
-            extra=self._editgroup_extra,
-        )
-        return self.api.create_editgroup(eg)
+    def reset(self):
+        self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0,
'exists': 0}) + self._edit_count = 0 + self._editgroup_id = None + self._entity_queue = [] - def describe_run(self): - print("Processed {} lines, inserted {}, updated {}.".format( - self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) + def push_record(self, raw_record): + """ + Returns nothing. + """ + if (not raw_record) or (not self.want(raw_record)): + self.counts['skip'] += 1 + return + entity = self.parse_record(raw_record) + if not entity: + self.counts['skip'] += 1 + return + if self.bezerk_mode: + self.push_entity(entity) + return + if self.try_update(entity): + self.push_entity(entity) + return - def create_row(self, row, editgroup_id=None): - # sub-classes expected to implement this - raise NotImplementedError + def finish(self): + if self._edit_count > 0: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 + + if self._entity_queue: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(self._entity_queue) + self._entity_queue = [] + + self.counts['total'] = 0 + for key in ('skip', 'insert', 'update', 'exists'): + self.counts['total'] += self.counts[key] + return self.counts + + def _get_editgroup(self, edits=1): + if self._edit_count >= self.edit_batch_size: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 - def create_batch(self, rows, editgroup_id=None): - # sub-classes expected to implement this + if not self._editgroup_id: + eg = self.api.create_editgroup( + fatcat_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra)) + self._editgroup_id = eg.editgroup_id + + self._edit_count += edits + return self._editgroup_id + + def create_container(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.container'] += 1 + return self.api.create_container(entity, editgroup_id=eg_id) + + def create_release(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.release'] += 1 + return self.api.create_release(entity, editgroup_id=eg_id) + + def create_file(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.file'] += 1 + return self.api.create_file(entity, editgroup_id=eg_id) + + def updated(self): + """ + Implementations should call this from try_update() if the update was successful + """ + self.counts['update'] += 1 + + def push_entity(self, entity): + self._entity_queue.append(entity) + if len(self._entity_queue) >= self.edit_batch_size: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(_entity_queue) + self._entity_queue = 0 + + def want(self, raw_record): + """ + Implementations can override for optional fast-path to drop a record. + Must have no side-effects; returns bool. + """ + return True + + def parse(self, raw_record): + """ + Returns an entity class type, or None if we should skip this one. + + May have side-effects (eg, create related entities), but shouldn't + update/mutate the actual entity. 
+ """ raise NotImplementedError - def process_source(self, source, group_size=100): - """Creates and auto-accepts editgroup every group_size rows""" - eg = self._editgroup() - i = 0 - for i, row in enumerate(source): - self.create_row(row, editgroup_id=eg.editgroup_id) - if i > 0 and (i % group_size) == 0: - self.api.accept_editgroup(eg.editgroup_id) - eg = self._editgroup() - self.counts['processed_lines'] += 1 - if i == 0 or (i % group_size) != 0: - self.api.accept_editgroup(eg.editgroup_id) - - def process_batch(self, source, size=50, decode_kafka=False): - """Reads and processes in batches (not API-call-per-)""" - for rows in grouper(source, size): - if decode_kafka: - rows = [msg.value.decode('utf-8') for msg in rows] - self.counts['processed_lines'] += len(rows) - #eg = self._editgroup() - #self.create_batch(rows, editgroup_id=eg.editgroup_id) - self.create_batch(rows) - - def process_csv_source(self, source, group_size=100, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_source(reader, group_size) - - def process_csv_batch(self, source, size=50, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_batch(reader, size) + def try_update(self, raw_record): + """ + Passed the output of parse(). Should try to find an existing entity and + update it (PUT), decide we should do nothing (based on the existing + record), or create a new one. - def is_issnl(self, issnl): - return len(issnl) == 9 and issnl[4] == '-' + Implementations must update the exists/updated/skip counts + appropriately in this method. - def lookup_issnl(self, issnl): - """Caches calls to the ISSN-L lookup API endpoint in a local dict""" - if issnl in self._issnl_id_map: - return self._issnl_id_map[issnl] - container_id = None - try: - rv = self.api.lookup_container(issnl=issnl) - container_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - self._issnl_id_map[issnl] = container_id # might be None - return container_id + Returns boolean: True if the entity should still be inserted, False otherwise + """ + raise NotImplementedError + + def insert_batch(self, raw_record): + raise NotImplementedError def is_orcid(self, orcid): return self._orcid_regex.match(orcid) is not None @@ -163,6 +248,23 @@ class FatcatImporter: self._doi_id_map[doi] = release_id # might be None return release_id + def is_issnl(self, issnl): + return len(issnl) == 9 and issnl[4] == '-' + + def lookup_issnl(self, issnl): + """Caches calls to the ISSN-L lookup API endpoint in a local dict""" + if issnl in self._issnl_id_map: + return self._issnl_id_map[issnl] + container_id = None + try: + rv = self.api.lookup_container(issnl=issnl) + container_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._issnl_id_map[issnl] = container_id # might be None + return container_id + def read_issn_map_file(self, issn_map_file): print("Loading ISSN map file...") self._issn_issnl_map = dict() @@ -179,3 +281,117 @@ class FatcatImporter: if issn is None: return None return self._issn_issnl_map.get(issn) + + +class RecordPusher: + """ + Base class for different importer sources. Pretty trivial interface, just + wraps an importer and pushes records in to it. 
+ """ + + def __init__(self, importer, **kwargs): + self.importer = importer + + def run(self): + """ + This will look something like: + + for line in sys.stdin: + record = json.loads(line) + self.importer.push_record(record) + print(self.importer.finish()) + """ + raise NotImplementedError + + +class JsonLinePusher(RecordPusher): + + def __init__(self, importer, json_file, **kwargs): + self.importer = importer + self.json_file = json_file + + def run(self): + for line in self.json_file: + if not line: + continue + record = json.loads(line) + self.importer.push_record(record) + counts = self.importer.finish() + print(counts) + return counts + + +class CsvPusher(RecordPusher): + + def __init__(self, importer, csv_file, **kwargs): + self.importer = importer + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + + def run(self): + for line in self.reader: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class LinePusher(RecordPusher): + + def __init__(self, importer, text_file, **kwargs): + self.importer = importer + self.text_file = text_file + + def run(self): + for line in self.text_file: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class KafkaJsonPusher(RecordPusher): + + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + self.importer = importer + self.consumer = make_kafka_consumer( + kafka_hosts, + kafka_env, + topic_suffix, + group, + ) + + def run(self): + count = 0 + for msg in self.consumer: + if not msg: + continue + record = json.loads(msg.value.decode('utf-8')) + self.importer.push_record(record) + count += 1 + if count % 500 == 0: + print("Import counts: {}".format(self.importer.counts)) + # TODO: should catch UNIX signals (HUP?) 
to shutdown cleanly, and/or
+            # commit the current batch if it has been lingering
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
+def make_kafka_consumer(hosts, env, topic_suffix, group):
+    topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
+    client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
+    consume_topic = client.topics[topic_name]
+    print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+
+    consumer = consume_topic.get_balanced_consumer(
+        consumer_group=group.encode('utf-8'),
+        managed=True,
+        auto_commit_enable=True,
+        auto_commit_interval_ms=30000, # 30 seconds
+        compacted_topic=True,
+    )
+    return consumer
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 6365e491..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@
 import datetime
 import itertools
 import subprocess
 import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
 # The docs/guide should be the canonical home for these mappings; update there
@@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = {
     'standard': 'standard',
 }
-class CrossrefImporter(FatcatImporter):
+CONTAINER_TYPE_MAP = {
+    'article-journal': 'journal',
+    'paper-conference': 'conference',
+    'book': 'book-series',
+}
+
+# TODO:
+LICENSE_SLUG_MAP = {
+    "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # http://www.springer.com/tdm doesn't seem like a license
+}
+
+class CrossrefImporter(EntityImporter):
     """
     Importer for Crossref metadata.
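Before continuing with the CrossrefImporter changes: the pusher/importer split introduced in common.py above (JsonLinePusher, CsvPusher, LinePusher, or KafkaJsonPusher feeding an EntityImporter subclass) is what fatcat_import.py now wires together. A minimal usage sketch, assuming an authenticated API client and a hypothetical local JSON-lines file (not part of this patch):

    # illustrative sketch only; the input file path is made up
    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import OrcidImporter, JsonLinePusher

    api = authenticated_api("http://localhost:9411/v0", token=None)
    importer = OrcidImporter(api, edit_batch_size=50)
    with open("orcid_profiles_sample.json") as f:
        counts = JsonLinePusher(importer, f).run()
    # run() drains the file, calls importer.finish(), and returns the importer's
    # Counter of 'insert'/'update'/'exists'/'skip' totals
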
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter): issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra) + + self.create_containers = kwargs.get('create_containers') extid_map_file = kwargs.get('extid_map_file') - create_containers = kwargs.get('create_containers') - check_existing = kwargs.get('check_existing') self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter): self.extid_map_db = sqlite3.connect(db_uri, uri=True) else: print("Not using external ID map") - self.create_containers = create_containers - self.check_existing = check_existing + + self.read_issn_map_file(issn_map_file) def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = [str(cell or '') or None for cell in row] return dict( core_id=row[0], pmid=row[1], pmcid=row[2], - wikidata_qid=row[3]) + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def map_release_type(self, crossref_type): return CROSSREF_TYPE_MAP.get(crossref_type) - def parse_crossref_dict(self, obj): + def map_container_type(self, crossref_type): + return CONTAINER_TYPE_MAP.get(crossref_type) + + def want(self, obj): + if not obj.get('title'): + return False + + # do most of these checks in-line below + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a ReleaseEntity """ - # Do require the 'title' keys to exsit, as release entities do - if (not 'title' in obj) or (not obj['title']): - return None - # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now if obj.get('type') in (None, 'journal', 'proceedings', @@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter): 'book-track', 'proceedings-series'): return None - # lookup existing DOI - existing_release = None - if self.check_existing: - try: - existing_release = self.api.lookup_release(doi=obj['DOI'].lower()) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - - # eventually we'll want to support "updates", but for now just skip if - # entity already exists - if existing_release: + # Do require the 'title' keys to exsit, as release entities do + if (not 'title' in obj) or (not obj['title']): return None + release_type = self.map_release_type(obj['type']) + # contribs def do_contribs(obj_list, ctype): contribs = [] @@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter): index = i else: index = None + raw_affiliation = None if am.get('affiliation'): - # note: affiliation => affiliations - extra['affiliations'] = am.get('affiliation') + if len(am.get('affiliation')) > 0: + raw_affiliation = am.get('affiliation')[0]['name'] + if len(am.get('affiliation')) > 1: + # note: affiliation => more_affiliations + extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] if am.get('sequence') and am.get('sequence') != "additional": - extra['sequence'] = am.get('sequence') + extra['seq'] = clean(am.get('sequence')) if not extra: extra = None assert ctype in ("author", "editor", "translator") contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, - raw_name=raw_name, + raw_name=clean(raw_name), + raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) return contribs @@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter): container_id = self.lookup_issnl(issnl) publisher = obj.get('publisher') - ce = None if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=publisher, - name=obj['container-title'][0]) + publisher=clean(publisher), + container_type=self.map_container_type(release_type), + name=clean(obj['container-title'][0], force_xml=True)) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + + # license slug + license_slug = None + license_extra = [] + for l in obj.get('license', []): + if l['content-version'] not in ('vor', 'unspecified'): + continue + slug = LICENSE_SLUG_MAP.get(l['URL']) + if slug: + license_slug = slug + if 'start' in l: + l['start'] = l['start']['date-time'] + license_extra.append(l) # references refs = [] for i, rm in enumerate(obj.get('reference', [])): try: year = int(rm.get('year')) - # NOTE: will need to update/config in the future! + # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
if year > 2025 or year < 100: year = None except: year = None - extra = rm.copy() - if rm.get('DOI'): - extra['doi'] = rm.get('DOI').lower() key = rm.get('key') if key and key.startswith(obj['DOI'].upper()): key = key.replace(obj['DOI'].upper() + "-", '') @@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter): container_name = rm.get('volume-title') if not container_name: container_name = rm.get('journal-title') - extra.pop('DOI', None) - extra.pop('key', None) - extra.pop('year', None) - extra.pop('volume-name', None) - extra.pop('journal-title', None) - extra.pop('title', None) - extra.pop('first-page', None) - extra.pop('doi-asserted-by', None) + elif rm.get('journal-title'): + extra['journal-title'] = rm['journal-title'] + extra = dict() + if rm.get('DOI'): + extra['doi'] = rm.get('DOI').lower() + # TODO: what fields here? CSL citation stuff + for k in ('author', 'editor', 'edition', 'authority', 'version', + 'genre', 'url', 'event', 'issue', 'volume', 'date', + 'accessed_date', 'issued', 'page', 'medium', + 'collection_title', 'chapter_number'): + if clean(rm.get(k)): + extra[k] = clean(rm[k]) if extra: extra = dict(crossref=extra) else: @@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter): target_release_id=None, key=key, year=year, - container_name=container_name, - title=rm.get('title'), - locator=rm.get('first-page'), + container_name=clean(container_name), + title=clean(rm.get('title')), + locator=clean(rm.get('first-page')), # TODO: just dump JSON somewhere here? extra=extra)) @@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter): if obj.get('abstract') != None: abstracts.append(fatcat_client.ReleaseEntityAbstracts( mimetype="application/xml+jats", - content=obj.get('abstract'))) + content=clean(obj.get('abstract')))) # extra fields extra = dict() - for key in ('subject', 'type', 'license', 'alternative-id', - 'container-title', 'original-title', 'subtitle', 'archive', - 'funder', 'group-title'): - # TODO: unpack "container-title" array + for key in ('subject', 'type', 'alternative-id', 'container-title', + 'subtitle', 'archive', 'funder', 'group-title'): + # TODO: unpack "container-title" array? 
val = obj.get(key) if val: - extra[key] = val - if 'license' in extra and extra['license']: - for i in range(len(extra['license'])): - if 'start' in extra['license'][i]: - extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] + if type(val) == str: + extra[key] = clean(val) + else: + extra[key] = val + if license_extra: + extra['license'] = license_extra + if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] - # TODO: this should be top-level - extra['is_kept'] = len(obj.get('archive', [])) > 0 + extra['other-titles'] = [clean(t) for t in obj['title'][1:]] # ISBN isbn13 = None @@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter): re = fatcat_client.ReleaseEntity( work_id=None, - title=obj.get('title', [None])[0], - contribs=contribs, - refs=refs, container_id=container_id, - publisher=publisher, - release_type=self.map_release_type(obj['type']), + title=clean(obj.get('title', [None])[0], force_xml=True), + original_title=clean(obj.get('original-title', [None])[0]), + release_type=release_type, release_status=release_status, + release_date=release_date, + release_year=release_year, + publisher=clean(publisher), doi=obj['DOI'].lower(), - isbn13=isbn13, - core_id=extids['core_id'], pmid=extids['pmid'], pmcid=extids['pmcid'], wikidata_qid=extids['wikidata_qid'], - release_date=release_date, - release_year=release_year, - issue=obj.get('issue'), - volume=obj.get('volume'), - pages=obj.get('page'), + isbn13=isbn13, + core_id=extids['core_id'], + arxiv_id=extids['arxiv_id'], + jstor_id=extids['jstor_id'], + volume=clean(obj.get('volume')), + issue=clean(obj.get('issue')), + pages=clean(obj.get('page')), + language=None, # crossref doesn't supply language info + license_slug=license_slug, + extra=dict(crossref=extra), abstracts=abstracts, - extra=dict(crossref=extra)) - return (re, ce) + contribs=contribs, + refs=refs, + ) + return re + + def try_update(self, re): + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_release_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) - def create_row(self, row, editgroup_id=None): - if row is None: - return - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - container = self.api.create_container(ce, editgroup_id=editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - self.api.create_release(re, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Current work/release pairing disallows batch creation of releases. 
- Could do batch work creation and then match against releases, but meh.""" - release_batch = [] - for row in batch: - if row is None: - continue - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - ce_eg = self.api.create_editgroup(fatcat_client.Editgroup()) - container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id) - self.api.accept_editgroup(ce_eg.editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - release_batch.append(re) - self.api.create_release_batch(release_batch, autoaccept="true") - self.counts['insert'] += len(release_batch) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 5e61a154..9d95fe0b 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,22 @@ import json import base64 import datetime import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean MAX_ABSTRACT_BYTES=4096 -class GrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(EntityImporter): + """ + This is a complex case: we need to parse and create both file and release entities. + + The "primary" entity here is really File, not Release. If a matching File + exists, we bail in want(); if not we insert the Release during parsing, and + insert both. + + TODO: should instead check if the File has any releases; if not, insert and update. + TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. + """ def __init__(self, api, **kwargs): @@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") + self.longtail_oa = kwargs.get("longtail_oa", False) + + def want(self, raw_record): + return True + + def parse_record(self, row): + + fields = row.split('\t') + sha1_key = fields[0] + cdx = json.loads(fields[1]) + mimetype = fields[2] + file_size = int(fields[3]) + grobid_meta = json.loads(fields[4]) + fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) + re = self.parse_grobid_json(grobid_meta) + + if not (fe and re): + return None + + # lookup existing file SHA1 + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # if file is already in here, presumably not actually long-tail + # HACK: this is doing an exists check in parse_record(), which is weird + # TODO: this is where we should check if the file actually has + # release_ids and/or URLs associated with it + if existing and not self.bezerk_mode: + self.counts['exists'] += 1 + self.counts['skip'] -= 1 + return None + + release_edit = self.create_release(re) + fe.release_ids.append(release_edit.ident) + return fe def parse_grobid_json(self, obj): @@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter): abobj = dict( mimetype="text/plain", language=None, - content=obj.get('abstract').strip()) + content=clean(obj.get('abstract'))) abstracts = [abobj] else: abstracts = None @@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter): for i, a in enumerate(obj.get('authors', [])): contribs.append(fatcat_client.ReleaseContrib( index=i, - raw_name=a['name'], + raw_name=clean(a['name']), role="author", extra=None)) + # XXX: why is this a dict()? 
not covered by tests? refs = [] for raw in obj.get('citations', []): cite_extra = dict() ref = dict() - ref['key'] = raw.get('id') + ref['key'] = clean(raw.get('id')) if raw.get('title'): - ref['title'] = raw['title'].strip() + ref['title'] = clean(raw['title']) if raw.get('date'): try: year = int(raw['date'].strip()[:4]) @@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter): pass for key in ('volume', 'url', 'issue', 'publisher'): if raw.get(key): - cite_extra[key] = raw[key].strip() + cite_extra[key] = clean(raw[key]) if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] + cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] if cite_extra: cite_extra = dict(grobid=cite_extra) else: @@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter): if obj.get('doi'): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True + extra['container_name'] = clean(obj['journal']['name']) # TODO: ISSN/eISSN handling? or just journal name lookup? + if self.longtail_oa: + extra['longtail_oa'] = True + if extra: extra = dict(grobid=extra) else: extra = None re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), + title=clean(obj['title'], force_xml=True), release_type="article-journal", release_date=release_date, release_year=release_year, contribs=contribs, refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=clean(obj['journal'].get('publisher')), + volume=clean(obj['journal'].get('volume')), + issue=clean(obj['journal'].get('issue')), abstracts=abstracts, extra=extra) return re @@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter): sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() - # lookup existing SHA1, or create new entity - try: - existing_file = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - existing_file = None - - if existing_file: - # if file is already in here, presumably not actually long-tail - return None fe = fatcat_client.FileEntity( sha1=sha1, size=int(file_size), @@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter): # parse URLs and CDX original = cdx['url'] + assert len(cdx['dt']) >= 8 wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) @@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter): return fe - def create_row(self, row, editgroup_id=None): - if not row: - return - fields = row.split('\t') - sha1_key = fields[0] - cdx = json.loads(fields[1]) - mimetype = fields[2] - file_size = int(fields[3]) - grobid_meta = json.loads(fields[4]) - fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) - re = self.parse_grobid_json(grobid_meta) - if fe and re: - release_entity = self.api.create_release(re, editgroup_id=editgroup_id) - # release ident can't already be in release list because we just - # created it - fe.release_ids.append(release_entity.ident) - file_entity = self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - # NB: batch mode not implemented + def try_update(self, entity): + # did the exists check in 'parse_record()', because we needed to create a release + return True + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + 
extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py deleted file mode 100644 index f4d525a4..00000000 --- a/python/fatcat_tools/importers/issn.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from .common import FatcatImporter - - -def or_none(s): - if s is None: - return None - if len(s) == 0: - return None - return s - -def truthy(s): - if s is None: - return None - s = s.lower() - - if s in ('true', 't', 'yes', 'y', '1'): - return True - elif s in ('false', 'f', 'no', 'n', '0'): - return False - else: - return None - -class IssnImporter(FatcatImporter): - """ - Imports journal metadata ("containers") by ISSN, currently from a custom - (data munged) .csv file format - - CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - - ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - """ - - def __init__(self, api, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra) - - def parse_issn_row(self, row): - """ - row is a python dict (parsed from CSV). - returns a ContainerEntity (or None if invalid or couldn't parse) - """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return None - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) - ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=title, - publisher=or_none(row['publisher']), - abbrev=None, - coden=None, - extra=extra) - return ce - - def create_row(self, row, editgroup_id=None): - ce = self.parse_issn_row(row) - if ce is not None: - self.api.create_container(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_issn_row(l) - for l in batch if (l is not None)] - objects = [o for o in objects if (o is not None)] - self.api.create_container_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py new file mode 100644 index 00000000..cf3971b5 --- /dev/null +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -0,0 +1,183 @@ + +import sys +import json +import itertools +import fatcat_client +from .common import EntityImporter, clean + + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class 
JournalMetadataImporter(EntityImporter): + """ + Imports journal metadata ("containers") by ISSN, currently from a custom + (data munged) .csv file format + + CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + + ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + + + 'extra' fields: + + doaj + as_of: datetime of most recent check; if not set, not actually in DOAJ + seal: bool + work_level: bool (are work-level publications deposited with DOAJ?) + archiving: array, can include 'library' or 'other' + road + as_of: datetime of most recent check; if not set, not actually in ROAD + pubmed (TODO: delete?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + norwegian (TODO: drop this?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + id (integer) + level (integer; 0-2) + kbart + lockss + year_rle + volume_rle + portico + ... + clockss + ... + sherpa_romeo + color + jstor + year_rle + volume_rle + scopus + id + TODO: print/electronic distinction? + wos + id + doi + crossref_doi: DOI of the title in crossref (if exists) + prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) + ia + sim + nap_id + year_rle + volume_rle + longtail: boolean + homepage + as_of: datetime of last attempt + url + status: HTTP/heritrix status of homepage crawl + + issnp: string + issne: string + coden: string + abbrev: string + oclc_id: string (TODO: lookup?) + lccn_id: string (TODO: lookup?) + dblb_id: string + default_license: slug + original_name: native name (if name is translated) + platform: hosting platform: OJS, wordpress, scielo, etc + mimetypes: array of strings (eg, 'application/pdf', 'text/html') + first_year: year (integer) + last_year: if publishing has stopped + primary_language: single ISO code, or 'mixed' + languages: array of ISO codes + region: TODO: continent/world-region + nation: shortcode of nation + discipline: TODO: highest-level subject; "life science", "humanities", etc + field: TODO: narrower description of field + subjects: TODO? + url: homepage + is_oa: boolean. If true, can assume all releases under this container are "Open Access" + TODO: domains, if exclusive? + TODO: fulltext_regex, if a known pattern? + + For KBART, etc: + We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. + year and volume spans are run-length-encoded arrays, using integers: + - if an integer, means that year is preserved + - if an array of length 2, means everything between the two numbers (inclusive) is preserved + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + + def want(self, raw_record): + if raw_record.get('ISSN-L'): + return True + return False + + def parse_record(self, row): + """ + row is a python dict (parsed from CSV). 
+ returns a ContainerEntity (or None if invalid or couldn't parse) + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return None + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=clean(title), + publisher=or_none(clean(row['publisher'])), + extra=extra) + return ce + + def try_update(self, ce): + + existing = None + try: + existing = self.api.lookup_container(issnl=ce.issnl) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_container_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 1e5c22f7..2ec6c95d 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,16 +4,10 @@ import json import sqlite3 import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean -#row = row.split('\t') -#assert len(row) == 2 -#sha1 = row[0].replace('sha1:') -#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() -#print(sha1) -#dois = [d.lower() for d in json.loads(row[1])] -class MatchedImporter(FatcatImporter): +class MatchedImporter(EntityImporter): """ Importer for "file to crossref DOI" matches. 
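For reference, a minimal sketch (not part of this changeset) of expanding the run-length-encoded year/volume spans described in the JournalMetadataImporter docstring above, where a bare integer marks a single preserved year and a two-element array marks an inclusive range; the helper name is illustrative only.
def expand_year_rle(year_rle):
    # Expand a KBART-style RLE span list into a sorted list of preserved years.
    # A bare integer marks one preserved year; a [start, end] pair marks every
    # year in the inclusive range as preserved.
    years = set()
    for span in (year_rle or []):
        if isinstance(span, int):
            years.add(span)
        else:
            start, end = span
            years.update(range(start, end + 1))
    return sorted(years)
# e.g. expand_year_rle([1995, [1998, 2001]]) -> [1995, 1998, 1999, 2000, 2001]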
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter): editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mime = kwargs.get("default_mime", None) - self.skip_file_updates = kwargs.get("skip_file_updates", False) def make_url(self, raw): rel = self.default_link_rel @@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter): rel = "repository" elif "//web.archive.org/" in raw or "//archive.is/" in raw: rel = "webarchive" - return fatcat_client.FileEntityUrls(url=raw, rel=rel) + return (rel, raw) - def parse_matched_dict(self, obj): - sha1 = obj['sha1'] - dois = [d.lower() for d in obj.get('dois', [])] + def want(self, raw_record): + return True - # lookup sha1, or create new entity - fe = None - if not self.skip_file_updates: - try: - fe = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - if fe is None: - fe = fatcat_client.FileEntity( - sha1=sha1, - release_ids=[], - urls=[], - ) + def parse_record(self, obj): + dois = [d.lower() for d in obj.get('dois', [])] # lookup dois re_list = set() @@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter): print("DOI not found: {}".format(doi)) else: re_list.add(re.ident) - if len(re_list) == 0: + release_ids = list(re_list) + if len(release_ids) == 0: return None - if fe.release_ids == set(re_list): - return None - re_list.update(fe.release_ids) - fe.release_ids = list(re_list) # parse URLs and CDX - existing_urls = [feu.url for feu in fe.urls] + urls = set() for url in obj.get('url', []): - if url not in existing_urls: - url = self.make_url(url) - if url != None: - fe.urls.append(url) + url = self.make_url(url) + if url != None: + urls.add(url) for cdx in obj.get('cdx', []): original = cdx['url'] wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) - if wayback not in existing_urls: - fe.urls.append( - fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) - if original not in existing_urls: - url = self.make_url(original) - if url != None: - fe.urls.append(url) - - if obj.get('size') != None: - fe.size = int(obj['size']) - fe.sha256 = obj.get('sha256', fe.sha256) - fe.md5 = obj.get('md5', fe.sha256) - if obj.get('mimetype') is None: - if fe.mimetype is None: - fe.mimetype = self.default_mime - else: - fe.mimetype = obj.get('mimetype') + urls.add(("webarchive", wayback)) + url = self.make_url(original) + if url != None: + urls.add(url) + urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls] + if len(urls) == 0: + return None + + size = obj.get('size') + if size: + size = int(size) + + fe = fatcat_client.FileEntity( + md5=obj.get('md5'), + sha1=obj['sha1'], + sha256=obj.get('sha256'), + size=size, + mimetype=obj.get('mimetype'), + release_ids=release_ids, + urls=urls, + ) return fe - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - fe = self.parse_matched_dict(obj) - if fe is not None: - if fe.ident is None: - self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - else: - self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id) - self.counts['update'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_matched_dict(json.loads(l)) - for l in batch if l != None] - new_objects = [o for o in objects if o != None and o.ident == None] - update_objects = [o for o in objects if o != None and o.ident != None] - if len(update_objects): - update_eg = 
self._editgroup().editgroup_id - for obj in update_objects: - self.api.update_file(obj.ident, obj, editgroup_id=update_eg) - self.api.accept_editgroup(update_eg) - if len(new_objects) > 0: - self.api.create_file_batch(new_objects, autoaccept="true") - self.counts['update'] += len(update_objects) - self.counts['insert'] += len(new_objects) + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + fe.release_ids = list(set(fe.release_ids + existing.release_ids)) + if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0: + # no new release matches *and* there are already existing URLs + self.counts['exists'] += 1 + return False + + # merge the existing into this one and update + existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) + existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha256 = existing.sha256 or fe.sha256 + self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup()) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 0c8b1d62..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e): return None return e -class OrcidImporter(FatcatImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): @@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) - def parse_orcid_dict(self, obj): + def want(self, raw_record): + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a CreatorEntity """ name = obj['person']['name'] - if name is None: - return None + assert name extra = None given = value_or_none(name.get('given-names')) sur = value_or_none(name.get('family-name')) @@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter): return None ce = fatcat_client.CreatorEntity( orcid=orcid, - given_name=given, - surname=sur, - display_name=display, + given_name=clean(given), + surname=clean(sur), + display_name=clean(display), extra=extra) return ce - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - ce = self.parse_orcid_dict(obj) - if ce is not None: - self.api.create_creator(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 + def try_update(self, raw_record): + existing = None + try: + existing = self.api.lookup_creator(orcid=raw_record.orcid) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_orcid_dict(json.loads(l)) - for l in batch if l != None] - objects = [o for o in objects if o != None] - self.api.create_creator_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) + def insert_batch(self, batch): + self.api.create_creator_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index 0f957f9a..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ + import collections from fatcat_client import ReleaseEntity, ApiClient @@ -26,25 +27,43 @@ def release_to_elasticsearch(release): Raises exception on error (never returns None) """ - if release.state != 'active': - raise ValueError("Entity is not 'active'") + if release.state in ('redirect', 'deleted'): + return dict( + ident = release.ident, + state = release.state, + ) + elif release.state != 'active': + raise ValueError("Unhandled release state: {}".format(release.state)) # First, the easy ones (direct copy) t = dict( ident = release.ident, + state = release.state, revision = release.revision, title = release.title, + original_title = release.original_title, release_type = release.release_type, release_status = release.release_status, language = release.language, + license = release.license_slug, doi = release.doi, pmid = release.pmid, pmcid = release.pmcid, isbn13 = release.isbn13, + wikidata_qid = release.wikidata_qid, core_id = release.core_id, - wikidata_qid = release.wikidata_qid + arxiv_id = release.arxiv_id, + jstor_id = release.jstor_id, ) + is_oa = None + is_longtail_oa = None + in_kbart = None + in_web = False + in_dweb = False + in_ia = False + in_shadow = False + if release.release_date: # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) t['release_date'] = release.release_date.isoformat() @@ -53,52 +72,99 @@ if release.release_year is not None: t['release_year'] = release.release_year + t['any_abstract'] = len(release.abstracts) > 0 + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + 
container = release.container - container_is_kept = False if container: t['publisher'] = container.publisher t['container_name'] = container.name t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + t['container_type'] = container.container_type + if container.extra: + if container.extra.get('is_oa') or container.extra.get('in_doaj'): + is_oa = True + if container.extra.get('in_kbart'): + # TODO: better KBART check goes here + in_kbart = True + if container.extra.get('ia'): + # TODO: container longtail check goes here + # TODO: sim/microfilm check goes here + pass + # TODO: SHERPA/Romeo goes here else: t['publisher'] = release.publisher files = release.files or [] t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None + t['fileset_count'] = len(release.filesets or []) + t['webcapture_count'] = len(release.webcaptures or []) + any_pdf_url = None + good_pdf_url = None + best_pdf_url = None + ia_pdf_url = None for f in files: + if f.extra and f.extra.get('shadows'): + # TODO: shadow check goes here + in_shadow = True is_pdf = 'pdf' in (f.mimetype or '') for url in (f.urls or []): - if url.rel == 'webarchive': - in_wa = True - if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): + if url.url.lower().startswith('http'): + in_web = True + if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + # TODO: not sure what rel will be + in_dweb = True + if is_pdf: + any_pdf_url = url.url + if is_pdf and url.rel in ('webarchive', 'repository'): + is_preserved = True + good_pdf_url = url.url + if '//web.archive.org/' in url.url or '//archive.org/' in url.url: in_ia = True if is_pdf: - t['file_pdf_url'] = url.url - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url.url - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia + best_pdf_url = url.url + ia_pdf_url = url.url + # here is where we bake-in priority; IA-specific + t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url + t['ia_pdf_url'] = ia_pdf_url + + if release.license_slug: + # TODO: more/better checks here, particularly strict *not* OA licenses + if release.license_slug.startswith("CC-"): + is_oa = True extra = release.extra or dict() if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = bool(release.abstracts) - t['is_kept'] = container_is_kept or extra.get('is_kept', False) + # TODO: longtail OA check from GROBID here + if extra.get('in_kbart'): + # NOTE: not actually setting this anywhere + in_kbart = True + if extra.get('is_oa'): + # NOTE: not actually setting this anywhere + is_oa = True + if extra.get('grobid'): + if not t.get('container_name'): + t['container_name'] = extra['grobid'].get('container_name') + if extra['grobid'].get('longtail_oa'): + is_longtail_oa = True + if extra.get('crossref'): + if extra['crossref'].get('archive'): + # all crossref archives are KBART, I believe + in_kbart = True - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names + if is_longtail_oa: + is_oa = True + t['is_oa'] 
= is_oa + t['is_longtail_oa'] = is_longtail_oa + t['in_kbart'] = in_kbart + t['in_web'] = in_web + t['in_dweb'] = in_dweb + t['in_ia'] = in_ia + t['is_preserved'] = in_ia or in_kbart return t diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 8690a791..636ed304 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -93,7 +93,7 @@ class EntityUpdatesWorker(FatcatWorker): release_edits = cle['editgroup']['edits']['releases'] for re in release_edits: ident = re['ident'] - release = self.api.get_release(ident, expand="files,container") + release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") release_dict = self.api.api_client.sanitize_for_serialization(release) producer.produce( message=json.dumps(release_dict).encode('utf-8'), diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py index 8035cbe5..03964c92 100644 --- a/python/fatcat_web/auth.py +++ b/python/fatcat_web/auth.py @@ -90,7 +90,10 @@ def handle_ia_xauth(email, password): 'secret': Config.IA_XAUTH_CLIENT_SECRET, }) if resp.status_code == 401 or (not resp.json().get('success')): - flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + try: + flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + except: + print("IA XAuth fail: {}".format(resp.content)) return render_template('auth_ia_login.html', email=email), resp.status_code elif resp.status_code != 200: flash("Internet Archive login failed (internal error?)") diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a5927d9b..926d5340 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -4,7 +4,7 @@ import json from flask import Flask, render_template, send_from_directory, request, \ url_for, abort, g, redirect, jsonify, session, flash from flask_login import login_required -from fatcat_web import app, api, auth_api +from fatcat_web import app, api, auth_api, priv_api from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth from fatcat_client.rest import ApiException from fatcat_web.search import do_search @@ -368,6 +368,8 @@ def search(): @app.route('/auth/login') def login(): # show the user a list of login options + if not priv_api: + flash("This web interface not configured with credentials to actually allow login (other than via token)") return render_template('auth_login.html') @app.route('/auth/ia/login', methods=['GET', 'POST']) diff --git a/python/fatcat_web/templates/container_view.html b/python/fatcat_web/templates/container_view.html index 29f0b9d9..4a175a5d 100644 --- a/python/fatcat_web/templates/container_view.html +++ b/python/fatcat_web/templates/container_view.html @@ -15,12 +15,6 @@ <p><b>Publisher:</b> {% if container.publisher != None %}{{ container.publisher }}{% else %}<i>Unknown</i>{% endif %} -{% if container.coden != None %} -<br><b>CODEN<sup><a href="https://en.wikipedia.org/wiki/CODEN">?</a></sup>:</b> <code>{{ container.coden }}</code> -{% endif %} -{% if container.abbrev != None %} -<br><b>Abbrev.:</b> <code>{{ container.abbrev }}</code> -{% endif %} {% if (container.extra != None) and (container.extra['url'] != None) and (container.extra['url']|length > 0) %} <br><b>Homepage:</b> <a href="{{ container.extra['url'] }}"> <code>{{ container.extra['url'] }}</code></a> {% endif %} diff --git a/python/fatcat_web/templates/release_view.html 
b/python/fatcat_web/templates/release_view.html index fd86b7c9..4e24b281 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -143,7 +143,7 @@ Raw Object: {% endif %} <br> -{% if release.refs.size != 0 %} +{% if release.refs != None and release.refs.size != 0 %} <h3>References</h3> This release citing other releases. <ol> diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index cbe519b0..9ce32ed7 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -19,7 +19,7 @@ class Config(object): GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip().decode('utf-8') # This is, effectively, the QA/PROD flag - FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="qa.fatcat.wiki") + FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="dev.fatcat.wiki") FATCAT_API_AUTH_TOKEN = os.environ.get("FATCAT_API_AUTH_TOKEN", default=None) FATCAT_API_HOST = os.environ.get("FATCAT_API_HOST", default="https://{}/v0".format(FATCAT_DOMAIN)) @@ -39,10 +39,11 @@ class Config(object): IA_XAUTH_CLIENT_SECRET = os.environ.get("IA_XAUTH_CLIENT_SECRET", default=None) # protect cookies (which include API tokens) - SESSION_COOKIE_HTTPONLY = True - SESSION_COOKIE_SECURE = True - SESSION_COOKIE_SAMESITE = 'Lax' - PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds + if FATCAT_DOMAIN != "dev.fatcat.wiki": + SESSION_COOKIE_HTTPONLY = True + SESSION_COOKIE_SECURE = True + SESSION_COOKIE_SAMESITE = 'Lax' + PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds try: GIT_RELEASE = raven.fetch_git_sha('..') diff --git a/python/tests/api_annotations.py b/python/tests/api_annotations.py new file mode 100644 index 00000000..0d3c5046 --- /dev/null +++ b/python/tests/api_annotations.py @@ -0,0 +1,39 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_annotations(api): + + eg = quick_eg(api) + + # ensure no annotations on this object + a = api.get_editgroup_annotations(eg.editgroup_id) + assert a == [] + + # create an annotation! + api.create_editgroup_annotation( + eg.editgroup_id, + EditgroupAnnotation( + comment_markdown="some *annotation*", + extra=dict(thing="thang"))) + + # check that we can fetch it all sorts of ways + a = api.get_editgroup_annotations(eg.editgroup_id) + assert len(a) == 1 + assert a[0].extra['thing'] == "thang" + + # the editor persists, so this is a hack to find a "recent" one + a2 = api.get_editor_annotations(eg.editor_id, limit=100) + found = None + for thing in a2: + if thing.annotation_id == a[0].annotation_id: + found = thing + break + assert found + assert found.extra['thing'] == "thang" diff --git a/python/tests/api_containers.py b/python/tests/api_containers.py new file mode 100644 index 00000000..674ae3b8 --- /dev/null +++ b/python/tests/api_containers.py @@ -0,0 +1,48 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_container(api): + eg = quick_eg(api) + + # all the fields! 
+ c1 = ContainerEntity( + name="some container name", + container_type="journal", + publisher="some container publisher", + issnl="1234-567X", + wikidata_qid="Q954248", + extra=dict(a=1, b=2), + ) + + c1edit = api.create_container(c1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + c2 = api.get_container(c1edit.ident) + + # check that fields match + assert c1.name == c2.name + assert c1.container_type == c2.container_type + assert c1.publisher == c2.publisher + assert c1.issnl == c2.issnl + assert c1.wikidata_qid == c2.wikidata_qid + assert c1.extra == c2.extra + + # expansion + # TODO: via release + # lookup + # TODO: via issnl; but need to generate random identifiers + +def test_container_examples(api): + + api.lookup_container(issnl='1549-1277') + + c1 = api.get_container('aaaaaaaaaaaaaeiraaaaaaaaam') + assert c1.name == "PLOS Medicine" + assert c1.issnl == "1549-1277" + diff --git a/python/tests/api_creators.py b/python/tests/api_creators.py new file mode 100644 index 00000000..7443675b --- /dev/null +++ b/python/tests/api_creators.py @@ -0,0 +1,44 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_creators(api): + eg = quick_eg(api) + + # all the fields! + c1 = CreatorEntity( + display_name="Emma Smith", + given_name="emma", + surname="smith", + orcid="0000-0002-1825-0097", + wikidata_qid="Q9542248", + extra=dict(a=1, b=5), + ) + + c1edit = api.create_creator(c1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + c2 = api.get_creator(c1edit.ident) + + # check that fields match + assert c1.display_name == c2.display_name + assert c1.given_name == c2.given_name + assert c1.surname == c2.surname + assert c1.orcid == c2.orcid + assert c1.wikidata_qid == c2.wikidata_qid + assert c1.extra == c2.extra + + # expansion + # TODO: via release + # lookup + # TODO: via issnl; but need to generate random identifiers + +def test_creators_examples(api): + # TODO: aaaaaaaaaaaaaircaaaaaaaaam + + api.lookup_creator(orcid='0000-0003-3118-6859') diff --git a/python/tests/api_editgroups.py b/python/tests/api_editgroups.py new file mode 100644 index 00000000..722d8686 --- /dev/null +++ b/python/tests/api_editgroups.py @@ -0,0 +1,140 @@ + +import json +import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_editgroup_submit(api): + # 1. check that edit group can be submitted/unsubmitted, and shows up in reviewable appropriately + # 2. 
accepted edits don't show up as reviewable and can't be submitted + + c1 = CreatorEntity(display_name="test updates") + eg = quick_eg(api) + c1 = api.get_creator(api.create_creator(c1, editgroup_id=eg.editgroup_id).ident) + + eg2 = api.get_editgroup(eg.editgroup_id) + assert not eg2.submitted + assert not eg2.changelog_index + + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + wip = api.get_editor_editgroups(eg.editor_id, limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in wip] + + api.update_editgroup(eg.editgroup_id, eg2, submit=True) + eg3 = api.get_editgroup(eg.editgroup_id) + assert eg3.submitted + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in reviewable] + + api.update_editgroup(eg.editgroup_id, eg2, submit=False) + eg3 = api.get_editgroup(eg.editgroup_id) + assert not eg3.submitted + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + + # put back in reviewable + api.update_editgroup(eg.editgroup_id, eg2, submit=True) + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in reviewable] + + # shouldn't be reviewable if accepted + api.accept_editgroup(eg.editgroup_id) + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + eg3 = api.get_editgroup(eg.editgroup_id) + #print(eg3) + assert eg3.submitted + assert eg3.changelog_index + + with pytest.raises(fatcat_client.rest.ApiException): + api.update_editgroup(eg.editgroup_id, eg3, submit=True) + with pytest.raises(fatcat_client.rest.ApiException): + eg3.description = "something" + api.update_editgroup(eg.editgroup_id, eg3) + + +def test_editgroup_ordering(api): + + eg1 = quick_eg(api) + eg2 = quick_eg(api) + api.update_editgroup( + eg1.editgroup_id, + Editgroup(editgroup_id=eg1.editgroup_id, description="FAIL"), + submit=True) + api.update_editgroup( + eg2.editgroup_id, + Editgroup(editgroup_id=eg2.editgroup_id, description="FAIL"), + submit=True) + + r1 = api.get_editgroups_reviewable() + #print(r1) + assert not r1[0].description + assert not r1[1].description + assert r1[0].submitted >= r1[1].submitted + + # should be no editgroups "in the future" (since now + 1sec) + r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() + datetime.timedelta(seconds=1)).isoformat()+"Z") + assert not r1 + + r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() - datetime.timedelta(seconds=5)).isoformat()+"Z") + assert r1[0].submitted <= r1[1].submitted + + +def test_editgroup_autoaccept(api): + # autoaccept changes: editgroups required when, in what combination + + eg = quick_eg(api) + c1 = CreatorEntity(display_name="test autoaccept") + c2 = CreatorEntity(display_name="test another autoaccept") + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2]) + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id, autoaccept=True) + + edits1 = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id) + edits2 = api.create_creator_batch([c1, c2], autoaccept=True) + + assert edits1[0].editgroup_id == eg.editgroup_id + assert edits1[0].editgroup_id != edits2[1].editgroup_id + eg1 = api.get_editgroup(edits1[0].editgroup_id) + eg2 = 
api.get_editgroup(edits2[0].editgroup_id) + + assert not eg1.changelog_index + assert eg2.changelog_index + #print(edits1) + #print(eg1.edits.creators) + assert eg1.edits.creators[0].ident in [t.ident for t in edits1] + assert eg2.edits.creators[0].ident in [t.ident for t in edits2] + + +def test_batch_params(api): + + eg = quick_eg(api) + c1 = CreatorEntity(display_name="test autoaccept") + c2 = CreatorEntity(display_name="test another autoaccept") + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2]) + + desc = "test description" + extra = dict(a=75, q="thing") + edits = api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=json.dumps(extra)) + eg = api.get_editgroup(edits[0].editgroup_id) + + assert eg.description == desc + assert eg.extra == extra + + # currently must manually json dumps() extra field + with pytest.raises(fatcat_client.rest.ApiException): + api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=extra) + + with pytest.raises(fatcat_client.rest.ApiException): + api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra="{") diff --git a/python/tests/api_files.py b/python/tests/api_files.py new file mode 100644 index 00000000..033538ef --- /dev/null +++ b/python/tests/api_files.py @@ -0,0 +1,52 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_file(api): + + eg = quick_eg(api) + + # all the fields! + f1 = FileEntity( + size=89238, + md5="7ce6615b2a5904939576d9567bd5f68e", + sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2", + sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3", + mimetype="application/pdf", + extra=dict(a=2, b=5), + urls=[ + FileEntityUrls(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), + ], + release_ids=[], + ) + + f1edit = api.create_file(f1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + f2 = api.get_file(f1edit.ident) + + # check that fields match + assert f1.size == f2.size + assert f1.md5 == f2.md5 + assert f1.sha1 == f2.sha1 + assert f1.sha256 == f2.sha256 + assert f1.mimetype == f2.mimetype + assert f1.extra == f2.extra + assert f1.urls == f2.urls + assert f1.release_ids == f2.release_ids + + # expansion + # TODO: via release + # lookup + # TODO: via hashes; but need to generate random? 
+ +def test_file_examples(api): + + api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362') + + f1 = api.get_file('aaaaaaaaaaaaamztaaaaaaaaam') diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py new file mode 100644 index 00000000..966b85ca --- /dev/null +++ b/python/tests/api_filesets.py @@ -0,0 +1,79 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_fileset(api): + + eg = quick_eg(api) + r1 = ReleaseEntity(title="test fileset release") + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + + fs1 = FilesetEntity( + manifest = [ + FilesetEntityManifest( + path="data/thing.tar.gz", + size=54321, + md5="540da3ea6e448d8dfb057c05225f853a", + sha1="1dab6a0e110f9b5d70b18db0abf051f7f93faf06", + sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a", + extra={"a": 1, "b": 3}, + ), + FilesetEntityManifest( + path="README.md", + size=54210, + md5="5f83592b5249671719bbed6ce91ecfa8", + sha1="455face3598611458efe1f072e58624790a67266", + sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905", + extra={"x": 1, "y": "q"}, + ), + ], + urls = [ + FileEntityUrls(url="https://archive.org/download/fileset-123/", rel="repository"), + FileEntityUrls(url="https://humble-host.com/~user123/dataset/", rel="web"), + ], + release_ids = [r1edit.ident], + ) + + fs1edit = api.create_fileset(fs1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + fs2 = api.get_fileset(fs1edit.ident) + + # check that fields match + assert fs1.urls == fs2.urls + assert fs1.manifest == fs2.manifest + assert fs1.release_ids == fs2.release_ids + + # expansion + r1 = api.get_release(r1edit.ident, expand="filesets") + assert r1.filesets[0].manifest == fs1.manifest + +def test_fileset_examples(api): + fs3 = api.get_fileset('aaaaaaaaaaaaaztgaaaaaaaaam') + + assert fs3.urls[0].url == 'http://other-personal-blog.name/dataset/' + assert fs3.urls[1].rel == 'archive' + assert fs3.manifest[1].md5 == 'f4de91152c7ab9fdc2a128f962faebff' + assert fs3.manifest[1].extra['mimetype'] == 'application/gzip' + +def test_bad_fileset(api): + + eg = quick_eg(api) + + bad_list = [ + # good (for testing test itself) + #FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size=1234)]), + #FilesetEntity(urls=[FileEntityUrls(url="thing", rel="blah")]), + FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size="big")]), + FilesetEntity(release_ids=["asdf"]), + ] + + for b in bad_list: + with pytest.raises(fatcat_client.rest.ApiException): + api.create_fileset(b, editgroup_id=eg.editgroup_id) + diff --git a/python/tests/api_misc.py b/python/tests/api_misc.py index 3510ea82..0a0f16da 100644 --- a/python/tests/api_misc.py +++ b/python/tests/api_misc.py @@ -8,14 +8,6 @@ from fatcat_client.rest import ApiException from fixtures import * -def test_lookups(api): - - api.lookup_creator(orcid='0000-0003-3118-6859') - api.lookup_container(issnl='1549-1277') - api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362') - api.lookup_release(pmid='54321') - api.lookup_release(isbn13='978-3-16-148410-0') - def test_lookup_hide_extend(api): r = api.lookup_release(doi='10.1371/journal.pmed.0020124') diff --git a/python/tests/api_releases.py b/python/tests/api_releases.py new file mode 100644 index 00000000..ed6f24a4 --- /dev/null +++ b/python/tests/api_releases.py @@ -0,0 +1,103 @@ + +import json 
+import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_release(api): + + eg = quick_eg(api) + + # all the fields! + r1 = ReleaseEntity( + title="some title", + original_title="оригинальное название", + release_type="post-weblog", + release_status="pre-print", + release_date=datetime.datetime.utcnow().date(), + release_year=2015, + doi="10.5555/12345678", + pmid="12345", + pmcid="PMC4321", + wikidata_qid="Q1234", + isbn13="978-3-16-148410-0", + core_id="187348", + arxiv_id="aslkdjfh", + jstor_id="8328424", + volume="84", + issue="XII", + pages="4-99", + publisher="some publisher", + language="en", + license_slug="CC-0", + extra=dict(a=1, b=2), + contribs=[], + refs=[], + abstracts=[ + ReleaseEntityAbstracts( + content="this is some abstract", + mimetype="text/plain", + lang="en"), + ReleaseEntityAbstracts( + content="this is some other abstract", + mimetype="text/plain", + lang="de"), + ], + ) + + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r1edit.ident) + + # check that fields match + assert r1.title == r2.title + assert r1.original_title == r2.original_title + assert r1.release_type == r2.release_type + assert r1.release_date == r2.release_date + assert r1.release_year == r2.release_year + assert r1.doi == r2.doi + assert r1.pmid == r2.pmid + assert r1.pmcid == r2.pmcid + assert r1.wikidata_qid == r2.wikidata_qid + assert r1.isbn13 == r2.isbn13 + assert r1.core_id == r2.core_id + assert r1.arxiv_id == r2.arxiv_id + assert r1.jstor_id == r2.jstor_id + assert r1.volume == r2.volume + assert r1.issue == r2.issue + assert r1.pages == r2.pages + assert r1.publisher == r2.publisher + assert r1.language == r2.language + assert r1.license_slug == r2.license_slug + assert r1.extra == r2.extra + + for i in range(len(r1.abstracts)): + assert r1.abstracts[i].content == r2.abstracts[i].content + assert r1.abstracts[i].mimetype == r2.abstracts[i].mimetype + assert r1.abstracts[i].lang == r2.abstracts[i].lang + for i in range(len(r1.contribs)): + assert r1.contribs[i] == r2.contribs[i] + for i in range(len(r1.refs)): + assert r1.refs[i] == r2.refs[i] + + # expansion + # TODO: via work + # lookup + # TODO: via all; but need to generate random identifiers + +def test_release_examples(api): + + api.lookup_release(pmid='54321') + api.lookup_release(isbn13='978-3-16-148410-0') + + r1 = api.get_release('aaaaaaaaaaaaarceaaaaaaaaai') + assert r1.title == "bigger example" + assert len(r1.refs) == 5 + assert r1.contribs[0].role == "editor" + assert r1.abstracts[0].mimetype == "application/xml+jats" + diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py new file mode 100644 index 00000000..dc1754b3 --- /dev/null +++ b/python/tests/api_webcaptures.py @@ -0,0 +1,96 @@ + +import json +import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_webcapture(api): + + eg = quick_eg(api) + r1 = ReleaseEntity(title="test webcapture release") + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + + wc1 = WebcaptureEntity( + original_url = "http://example.site", + #timestamp = "2012-01-02T03:04:05Z", + timestamp = datetime.datetime.now(datetime.timezone.utc), + cdx = [ + WebcaptureEntityCdx( + surt="site,example,)/data/thing.tar.gz", + #timestamp="2012-01-02T03:04:05Z", + 
timestamp=datetime.datetime.now(datetime.timezone.utc), + url="http://example.site/data/thing.tar.gz", + mimetype="application/gzip", + status_code=200, + sha1="455face3598611458efe1f072e58624790a67266", + sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a", + ), + WebcaptureEntityCdx( + surt="site,example,)/README.md", + #timestamp="2012-01-02T03:04:05Z", + timestamp=datetime.datetime.now(datetime.timezone.utc), + url="http://example.site/README.md", + mimetype="text/markdown", + status_code=200, + sha1="455face3598611458efe1f072e58624790a67266", + sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905", + ), + ], + archive_urls = [ + FileEntityUrls(rel="wayback", url="https://web.archive.org/web/"), + ], + release_ids = [r1edit.ident], + ) + + wc1edit = api.create_webcapture(wc1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + wc2 = api.get_webcapture(wc1edit.ident) + + # check that fields match + # I don't know why these aren't equal... + #print(wc1.archive_urls) + #print(wc2.archive_urls) + #assert wc1.archive_urls == wc2.archive_urls + assert wc1.archive_urls[0].rel == wc2.archive_urls[0].rel + assert wc1.archive_urls[0].url == wc2.archive_urls[0].url + assert wc1.cdx == wc2.cdx + assert wc1.release_ids == wc2.release_ids + assert wc1.timestamp == wc2.timestamp + assert wc1.original_url == wc2.original_url + + # TODO: check release expansion + r1 = api.get_release(r1edit.ident, expand="webcaptures") + print(r1) + assert r1.webcaptures[0].cdx == wc1.cdx + +def test_webcapture_examples(api): + wc3 = api.get_webcapture('aaaaaaaaaaaaa53xaaaaaaaaam') + + assert wc3.cdx[0].surt == 'org,asheesh)/' + assert wc3.cdx[1].sha1 == 'a637f1d27d9bcb237310ed29f19c07e1c8cf0aa5' + assert wc3.archive_urls[1].rel == 'warc' + + +def test_bad_webcapture(api): + + eg = quick_eg(api) + + bad_list = [ + # good (for testing test itself) + WebcaptureEntity(cdx=[ + WebcaptureEntityCdx( + surt="site,example,)/123.jpg", + url="http://example.site/123.jpg", + sha1="455face3598611458efe1f072e58624790a67266", + timestamp=201506071122)]), + ] + + for b in bad_list: + with pytest.raises(fatcat_client.rest.ApiException): + api.create_webcapture(b, editgroup_id=eg.editgroup_id) + diff --git a/python/tests/citation_efficiency.py b/python/tests/citation_efficiency.py new file mode 100644 index 00000000..fe5006cc --- /dev/null +++ b/python/tests/citation_efficiency.py @@ -0,0 +1,113 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_citation_indexing(api): + # indexing is consistent and reacts to change + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1.refs = [ + ReleaseRef(key="first", title="the first title"), + ReleaseRef(key="second", title="the second title"), + ReleaseRef(key="third", title="a third title"), + ] + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert r1.refs[1].key == "second" + assert r1.refs[2].index == 2 + assert r1.refs[2].key == "third" + + r1.refs.pop(1) + eg = quick_eg(api) + api.update_release(r1.ident, r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r1 = api.get_release(r1.ident) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert 
r1.refs[1].key == "third" + +def test_citation_targets(api): + # invariant to linking citations + # also, updates work + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = ReleaseEntity(title="the citer") + r2.refs = [ + ReleaseRef(key="first", title="something else"), + ReleaseRef(key="second", title="the target title"), + ] + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + eg = quick_eg(api) + r2.refs[1].target_release_id = r1.ident + api.update_release(r2.ident, r2, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r2.ident) + assert r2.refs[0].key == "first" + assert r2.refs[1].key == "second" + assert r2.refs[0].index == 0 # TODO: one-indexing? + assert r2.refs[1].index == 1 + assert r2.refs[0].target_release_id == None + assert r2.refs[1].target_release_id == r1.ident + assert len(r2.refs) == 2 + +def test_citation_empty_array(api): + # distinction between empty array (no citations) and no array (hidden) + + r1 = ReleaseEntity(title="citation null") + r2 = ReleaseEntity(title="citation empty array") + r1.refs = None + r2.refs = [] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + print(r1.refs) + print(r2.refs) + assert r1.refs == [] + assert r1.refs == r2.refs + + r1b = api.get_release(r1.ident, hide="refs") + assert r1b.refs == None + +def test_citation_encoding(api): + # escape-only changes (eg, \u1234 whatever for ASCII) + + r1 = ReleaseEntity(title="citation encoding") + title = "title-unicode \\u0050 \\\" " + container = "container-unicode ☃︎ ä ö ü スティー" + extra = extra={'a': 1, 'b': 2, 'ö': 3} + locator = "p123" + r1.refs = [ + ReleaseRef(key="1", year=1923, title=title, container_name=container, + extra=extra, locator=locator), + ReleaseRef(key="2"), + ] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert title == r1.refs[0].title + assert container == r1.refs[0].container_name + assert extra == r1.refs[0].extra + assert locator == r1.refs[0].locator + diff --git a/python/tests/cli.sh b/python/tests/cli.sh index eba6d3a7..19d8a85b 100755 --- a/python/tests/cli.sh +++ b/python/tests/cli.sh @@ -14,7 +14,7 @@ set -x ./fatcat_import.py crossref tests/files/crossref-works.2018-01-21.badsample.json tests/files/ISSN-to-ISSN-L.snip.txt ./fatcat_import.py orcid tests/files/0000-0001-8254-7103.json -./fatcat_import.py issn tests/files/journal_extra_metadata.snip.csv +./fatcat_import.py journal-metadata tests/files/journal_extra_metadata.snip.csv ./fatcat_import.py matched tests/files/matched_sample.json ./fatcat_import.py matched tests/files/example_matched.json ./fatcat_import.py grobid-metadata tests/files/example_grobid_metadata_lines.tsv diff --git a/python/tests/files/crossref-works.single.json b/python/tests/files/crossref-works.single.json index 2af2b358..e3d2e05c 100644 --- a/python/tests/files/crossref-works.single.json +++ b/python/tests/files/crossref-works.single.json @@ -84,7 +84,7 @@ { "given": "Carlos G.", "family": "Diaz", - "affiliation": ["Some University"] + "affiliation": [{"name": "Some University"}, {"name": "Some Department"}] }, { "given": "Francisco M.", diff 
--git a/python/tests/fixtures.py b/python/tests/fixtures.py index 6a880c48..3cc275b3 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -28,6 +28,7 @@ def api(): conf.api_key["Authorization"] = os.getenv("FATCAT_API_AUTH_TOKEN") conf.api_key_prefix["Authorization"] = "Bearer" api_client = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + api_client.editor_id = "aaaaaaaaaaaabkvkaaaaaaaaae" return api_client def test_get_changelog_entry(api): @@ -38,33 +39,6 @@ def test_get_changelog_entry(api): ## Helpers ################################################################## def quick_eg(api_inst): - eg = api_inst.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + eg = api_inst.create_editgroup(fatcat_client.Editgroup()) return eg -# TODO: what are these even here for? -def check_entity_fields(e): - for key in ('rev', 'is_live', 'redirect_id'): - assert key in e - for key in ('id',): - assert e[key] is not None - -def check_release(e): - for key in ('work', 'release_type'): - assert key in e - for key in ('title', ): - assert e[key] is not None - for key in ('refs', 'creators'): - assert type(e[key]) == list - -def check_creator(e): - for key in ('name',): - assert e[key] is not None - -def check_container(e): - for key in ('name',): - assert e[key] is not None - -def check_file(e): - for key in ('size', 'sha1'): - assert e[key] is not None diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index e2ca6122..193f78f6 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -1,35 +1,51 @@ import json import pytest -from fatcat_tools.importers import CrossrefImporter +from fatcat_tools.importers import CrossrefImporter, JsonLinePusher from fixtures import api @pytest.fixture(scope="function") def crossref_importer(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=False) + yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True) @pytest.fixture(scope="function") def crossref_importer_existing(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=True) + yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) def test_crossref_importer_batch(crossref_importer): with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_batch(f) + JsonLinePusher(crossref_importer, f).run() def test_crossref_importer(crossref_importer): + last_index = crossref_importer.api.get_changelog(limit=1)[0].index with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_source(f) + crossref_importer.bezerk_mode = True + counts = JsonLinePusher(crossref_importer, f).run() + assert counts['insert'] == 14 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + # fetch most recent editgroup - changes = crossref_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = crossref_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "crossref" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.CrossrefImporter" in eg.extra['agent'] + last_index = 
crossref_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: + crossref_importer.bezerk_mode = False + crossref_importer.reset() + counts = JsonLinePusher(crossref_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 14 + assert counts['skip'] == 0 + assert last_index == crossref_importer.api.get_changelog(limit=1)[0].index + def test_crossref_mappings(crossref_importer): assert crossref_importer.map_release_type('journal-article') == "article-journal" assert crossref_importer.map_release_type('asdf') is None @@ -39,13 +55,13 @@ def test_crossref_mappings(crossref_importer): def test_crossref_importer_create(crossref_importer): crossref_importer.create_containers = True with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_source(f) + JsonLinePusher(crossref_importer, f).run() def test_crossref_dict_parse(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line raw = json.loads(f.read()) - (r, c) = crossref_importer.parse_crossref_dict(raw) + r = crossref_importer.parse_record(raw) extra = r.extra['crossref'] assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators" assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t" @@ -61,7 +77,8 @@ def test_crossref_dict_parse(crossref_importer): assert len(r.contribs) == 5 assert r.contribs[0].raw_name == "Marcelo D. Radicioni" assert r.contribs[0].index == 0 - assert r.contribs[1].extra['affiliations'] == ["Some University"] + assert r.contribs[1].raw_affiliation == "Some University" + assert r.contribs[1].extra['more_affiliations'] == ["Some Department"] assert r.contribs[1].role == "author" assert r.contribs[3].role == "editor" assert r.contribs[3].index is None @@ -78,8 +95,10 @@ def test_crossref_dict_parse(crossref_importer): def test_stateful_checking(crossref_importer_existing): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line, a whole document - raw = json.loads(f.read()) + raw = f.read() # might not exist yet... - crossref_importer_existing.process_source([json.dumps(raw)]) - # ok, make sure we get 'None' back - assert crossref_importer_existing.parse_crossref_dict(raw) is None + crossref_importer_existing.push_record(json.loads(raw)) + crossref_importer_existing.finish() + # make sure we wouldn't insert again + entity = crossref_importer_existing.parse_record(json.loads(raw)) + assert crossref_importer_existing.try_update(entity) is False diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index 97ebcaef..4fed4aaa 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -3,7 +3,7 @@ import os import json import base64 import pytest -from fatcat_tools.importers import GrobidMetadataImporter +from fatcat_tools.importers import GrobidMetadataImporter, LinePusher from fixtures import api """ @@ -15,10 +15,6 @@ side-effects. Should probably be disabled or re-written. def grobid_metadata_importer(api): yield GrobidMetadataImporter(api) -# TODO: use API to check that entities actually created... 
-#def test_grobid_metadata_importer_batch(grobid_metadata_importer): -# with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: -# grobid_metadata_importer.process_batch(f) def test_grobid_metadata_parse(grobid_metadata_importer): with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: @@ -30,7 +26,8 @@ def test_grobid_metadata_parse(grobid_metadata_importer): print(re.contribs) assert re.contribs[0].raw_name == "Wahyu Ary" assert re.publisher == None - assert re.extra.get('container_name') == None + if re.extra: + assert re.extra.get('container_name') == None assert len(re.refs) == 27 def test_file_metadata_parse(grobid_metadata_importer): @@ -53,13 +50,28 @@ def test_file_metadata_parse(grobid_metadata_importer): assert len(fe.release_ids) == 0 def test_grobid_metadata_importer(grobid_metadata_importer): + last_index = grobid_metadata_importer.api.get_changelog(limit=1)[0].index with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: - grobid_metadata_importer.process_source(f) + grobid_metadata_importer.bezerk_mode = True + counts = LinePusher(grobid_metadata_importer, f).run() + assert counts['insert'] == 10 + assert counts['inserted.release'] == 10 + assert counts['exists'] == 0 + assert counts['skip'] == 0 # fetch most recent editgroup - changes = grobid_metadata_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = grobid_metadata_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "grobid" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.GrobidMetadataImporter" in eg.extra['agent'] + + with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: + grobid_metadata_importer.reset() + grobid_metadata_importer.bezerk_mode = False + counts = LinePusher(grobid_metadata_importer, f).run() + assert counts['insert'] == 0 + assert counts['inserted.release'] == 0 + assert counts['exists'] == 10 + assert counts['skip'] == 0 diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py deleted file mode 100644 index 6b5978d9..00000000 --- a/python/tests/import_issn.py +++ /dev/null @@ -1,26 +0,0 @@ - -import pytest -from fatcat_tools.importers import IssnImporter -from fixtures import api - - -@pytest.fixture(scope="function") -def issn_importer(api): - yield IssnImporter(api) - -# TODO: use API to check that entities actually created... -def test_issn_importer_batch(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_batch(f) - -def test_issn_importer(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_source(f) - - # fetch most recent editgroup - changes = issn_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup - assert eg.description - assert "container" in eg.description.lower() - assert eg.extra['git_rev'] - assert "fatcat_tools.IssnImporter" in eg.extra['agent'] diff --git a/python/tests/import_journal_metadata.py b/python/tests/import_journal_metadata.py new file mode 100644 index 00000000..a2b10a65 --- /dev/null +++ b/python/tests/import_journal_metadata.py @@ -0,0 +1,39 @@ + +import pytest +from fatcat_tools.importers import JournalMetadataImporter, CsvPusher +from fixtures import api + + +@pytest.fixture(scope="function") +def journal_metadata_importer(api): + yield JournalMetadataImporter(api) + +# TODO: use API to check that entities actually created... 
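One hedged way to address the TODO above, sketched here rather than implemented in this changeset: verify via the lookup API that a container really was created. lookup_container(issnl=...) is the same call the importer's try_update() uses; the helper name and the ISSN-L value below are placeholders, not necessarily present in the snip CSV.
def assert_container_created(api, issnl):
    # Raises ApiException (404) if no container with this ISSN-L exists.
    found = api.lookup_container(issnl=issnl)
    assert found.issnl == issnl
# usage sketch, e.g. after CsvPusher(...).run():
# assert_container_created(journal_metadata_importer.api, '1234-5678')  # placeholder ISSN-L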
+def test_journal_metadata_importer_batch(journal_metadata_importer): + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + CsvPusher(journal_metadata_importer, f).run() + +def test_journal_metadata_importer(journal_metadata_importer): + last_index = journal_metadata_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.bezerk_mode = True + counts = CsvPusher(journal_metadata_importer, f).run() + assert counts['insert'] == 9 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = journal_metadata_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "container" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.JournalMetadataImporter" in eg.extra['agent'] + + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.reset() + journal_metadata_importer.bezerk_mode = False + counts = CsvPusher(journal_metadata_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 9 + assert counts['skip'] == 0 diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py index 080674ac..8f694456 100644 --- a/python/tests/import_matched.py +++ b/python/tests/import_matched.py @@ -1,7 +1,7 @@ import json import pytest -from fatcat_tools.importers import MatchedImporter +from fatcat_tools.importers import MatchedImporter, JsonLinePusher from fixtures import api @@ -10,26 +10,40 @@ def matched_importer(api): yield MatchedImporter(api) # TODO: use API to check that entities actually created... -def test_matched_importer_batch(matched_importer): +def test_matched_importer(matched_importer): with open('tests/files/example_matched.json', 'r') as f: - matched_importer.process_batch(f) + JsonLinePusher(matched_importer, f).run() def test_matched_importer(matched_importer): + last_index = matched_importer.api.get_changelog(limit=1)[0].index with open('tests/files/example_matched.json', 'r') as f: - matched_importer.process_source(f) + matched_importer.bezerk_mode = True + counts = JsonLinePusher(matched_importer, f).run() + assert counts['insert'] == 2 + assert counts['exists'] == 0 + assert counts['skip'] == 11 # fetch most recent editgroup - changes = matched_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = matched_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "file-to-release" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.MatchedImporter" in eg.extra['agent'] + # re-insert; should skip + with open('tests/files/example_matched.json', 'r') as f: + matched_importer.reset() + matched_importer.bezerk_mode = False + counts = JsonLinePusher(matched_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 2 + assert counts['skip'] == 11 + def test_matched_dict_parse(matched_importer): with open('tests/files/example_matched.json', 'r') as f: raw = json.loads(f.readline()) - f = matched_importer.parse_matched_dict(raw) + f = matched_importer.parse_record(raw) assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313" assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff" assert f.mimetype == "application/pdf" diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py index 717a1328..57886b52 100644 --- a/python/tests/import_orcid.py +++ b/python/tests/import_orcid.py @@ 
-1,7 +1,7 @@ import json import pytest -from fatcat_tools.importers import OrcidImporter +from fatcat_tools.importers import OrcidImporter, JsonLinePusher from fixtures import api @@ -9,37 +9,46 @@ from fixtures import api def orcid_importer(api): yield OrcidImporter(api) -# TODO: use API to check that entities actually created... -def test_orcid_importer_batch(orcid_importer): - with open('tests/files/0000-0001-8254-7103.json', 'r') as f: - orcid_importer.process_batch(f) - def test_orcid_importer_badid(orcid_importer): with open('tests/files/0000-0001-8254-710X.json', 'r') as f: - orcid_importer.process_batch(f) + JsonLinePusher(orcid_importer, f).run() +# TODO: use API to check that entities actually created... def test_orcid_importer(orcid_importer): + last_index = orcid_importer.api.get_changelog(limit=1)[0].index with open('tests/files/0000-0001-8254-7103.json', 'r') as f: - orcid_importer.process_source(f) + orcid_importer.bezerk_mode = True + counts = JsonLinePusher(orcid_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 # fetch most recent editgroup - changes = orcid_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = orcid_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "orcid" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.OrcidImporter" in eg.extra['agent'] + with open('tests/files/0000-0001-8254-7103.json', 'r') as f: + orcid_importer.reset() + orcid_importer.bezerk_mode = False + counts = JsonLinePusher(orcid_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + def test_orcid_importer_x(orcid_importer): with open('tests/files/0000-0003-3953-765X.json', 'r') as f: - orcid_importer.process_source(f) + JsonLinePusher(orcid_importer, f).run() c = orcid_importer.api.lookup_creator(orcid="0000-0003-3953-765X") assert c is not None def test_orcid_dict_parse(orcid_importer): with open('tests/files/0000-0001-8254-7103.json', 'r') as f: raw = json.loads(f.readline()) - c = orcid_importer.parse_orcid_dict(raw) + c = orcid_importer.parse_record(raw) assert c.given_name == "Man-Hui" assert c.surname == "Li" assert c.display_name == "Man-Hui Li" diff --git a/python/tests/importer.py b/python/tests/importer.py index 34efa5d8..9308ba84 100644 --- a/python/tests/importer.py +++ b/python/tests/importer.py @@ -1,13 +1,13 @@ import pytest -from fatcat_tools.importers import FatcatImporter +from fatcat_tools.importers import CrossrefImporter, OrcidImporter from fixtures import api def test_issnl_mapping_lookup(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter(api, issn_map_file=issn_file) + fi = CrossrefImporter(api, issn_map_file=issn_file) assert fi.issn2issnl('0000-0027') == '0002-0027' assert fi.issn2issnl('0002-0027') == '0002-0027' @@ -18,20 +18,18 @@ def test_issnl_mapping_lookup(api): def test_identifiers(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter(api, issn_map_file=issn_file) - - assert fi.is_issnl("1234-5678") == True - assert fi.is_issnl("1234-5678.") == False - assert fi.is_issnl("12345678") == False - assert fi.is_issnl("1-2345678") == False - - assert fi.is_doi("10.1234/56789") == True - assert fi.is_doi("101234/56789") == False - assert fi.is_doi("10.1234_56789") == False - - assert fi.is_orcid("0000-0003-3118-6591") == True - assert 
fi.is_orcid("0000-0003-3953-765X") == True - assert fi.is_orcid("0000-00x3-3118-659") == False - assert fi.is_orcid("0000-00033118-659") == False - assert fi.is_orcid("0000-0003-3118-659.") == False + ci = CrossrefImporter(api, issn_map_file=issn_file) + + assert ci.is_issnl("1234-5678") == True + assert ci.is_issnl("1234-5678.") == False + assert ci.is_issnl("12345678") == False + assert ci.is_issnl("1-2345678") == False + + oi = OrcidImporter(api) + + assert oi.is_orcid("0000-0003-3118-6591") == True + assert oi.is_orcid("0000-0003-3953-765X") == True + assert oi.is_orcid("0000-00x3-3118-659") == False + assert oi.is_orcid("0000-00033118-659") == False + assert oi.is_orcid("0000-0003-3118-659.") == False diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py index e9d23250..6d6c6c82 100644 --- a/python/tests/transform_tests.py +++ b/python/tests/transform_tests.py @@ -11,7 +11,7 @@ def test_elasticsearch_convert(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line raw = json.loads(f.read()) - (r, c) = crossref_importer.parse_crossref_dict(raw) + r = crossref_importer.parse_record(raw) r.state = 'active' release_to_elasticsearch(r) |