path: root/python
Diffstat (limited to 'python')
-rw-r--r--  python/Pipfile                                        1
-rw-r--r--  python/Pipfile.lock                                 189
-rw-r--r--  python/README_import.md                               9
-rw-r--r--  python/env.example                                    8
-rwxr-xr-x  python/fatcat_import.py                              78
-rw-r--r--  python/fatcat_tools/importers/__init__.py            19
-rw-r--r--  python/fatcat_tools/importers/common.py              390
-rw-r--r--  python/fatcat_tools/importers/crossref.py            263
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py     123
-rw-r--r--  python/fatcat_tools/importers/issn.py                 89
-rw-r--r--  python/fatcat_tools/importers/journal_metadata.py    183
-rw-r--r--  python/fatcat_tools/importers/matched.py             150
-rw-r--r--  python/fatcat_tools/importers/orcid.py                51
-rw-r--r--  python/fatcat_tools/transforms.py                    130
-rw-r--r--  python/fatcat_tools/workers/changelog.py               2
-rw-r--r--  python/fatcat_web/auth.py                              5
-rw-r--r--  python/fatcat_web/routes.py                            4
-rw-r--r--  python/fatcat_web/templates/container_view.html        6
-rw-r--r--  python/fatcat_web/templates/release_view.html          2
-rw-r--r--  python/fatcat_web/web_config.py                       11
-rw-r--r--  python/tests/api_annotations.py                       39
-rw-r--r--  python/tests/api_containers.py                        48
-rw-r--r--  python/tests/api_creators.py                          44
-rw-r--r--  python/tests/api_editgroups.py                       140
-rw-r--r--  python/tests/api_files.py                             52
-rw-r--r--  python/tests/api_filesets.py                          79
-rw-r--r--  python/tests/api_misc.py                               8
-rw-r--r--  python/tests/api_releases.py                         103
-rw-r--r--  python/tests/api_webcaptures.py                       96
-rw-r--r--  python/tests/citation_efficiency.py                  113
-rwxr-xr-x  python/tests/cli.sh                                    2
-rw-r--r--  python/tests/files/crossref-works.single.json          2
-rw-r--r--  python/tests/fixtures.py                              30
-rw-r--r--  python/tests/import_crossref.py                       47
-rw-r--r--  python/tests/import_grobid_metadata.py                30
-rw-r--r--  python/tests/import_issn.py                           26
-rw-r--r--  python/tests/import_journal_metadata.py               39
-rw-r--r--  python/tests/import_matched.py                        28
-rw-r--r--  python/tests/import_orcid.py                          33
-rw-r--r--  python/tests/importer.py                              34
-rw-r--r--  python/tests/transform_tests.py                        2
41 files changed, 1966 insertions, 742 deletions
diff --git a/python/Pipfile b/python/Pipfile
index eebdab36..b04bb91a 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -32,6 +32,7 @@ python-dateutil = "*"
sickle = "*"
python-snappy = "*"
pymacaroons = "*"
+ftfy = "*"
[requires]
# Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 296079f0..f2d39a99 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "c3deb49cf4c122c2aed3f4f944e9763cfcf40c85891ca3d3e9cabc3debbb9075"
+ "sha256": "8f98bb3f6a3083c8b03cb68d1ee48b25449a950dd8a9d15189f2eb4fae48f760"
},
"pipfile-spec": 6,
"requires": {
@@ -96,27 +96,27 @@
},
"cryptography": {
"hashes": [
- "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd",
- "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038",
- "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378",
- "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3",
- "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13",
- "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc",
- "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f",
- "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427",
- "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515",
- "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b",
- "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf",
- "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681",
- "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b",
- "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5",
- "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b",
- "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6",
- "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70",
- "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e",
- "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859"
- ],
- "version": "==2.4.2"
+ "sha256:05b3ded5e88747d28ee3ef493f2b92cbb947c1e45cf98cfef22e6d38bb67d4af",
+ "sha256:06826e7f72d1770e186e9c90e76b4f84d90cdb917b47ff88d8dc59a7b10e2b1e",
+ "sha256:08b753df3672b7066e74376f42ce8fc4683e4fd1358d34c80f502e939ee944d2",
+ "sha256:2cd29bd1911782baaee890544c653bb03ec7d95ebeb144d714b0f5c33deb55c7",
+ "sha256:31e5637e9036d966824edaa91bf0aa39dc6f525a1c599f39fd5c50340264e079",
+ "sha256:42fad67d7072216a49e34f923d8cbda9edacbf6633b19a79655e88a1b4857063",
+ "sha256:4946b67235b9d2ea7d31307be9d5ad5959d6c4a8f98f900157b47abddf698401",
+ "sha256:522fdb2809603ee97a4d0ef2f8d617bc791eb483313ba307cb9c0a773e5e5695",
+ "sha256:6f841c7272645dd7c65b07b7108adfa8af0aaea57f27b7f59e01d41f75444c85",
+ "sha256:7d335e35306af5b9bc0560ca39f740dfc8def72749645e193dd35be11fb323b3",
+ "sha256:8504661ffe324837f5c4607347eeee4cf0fcad689163c6e9c8d3b18cf1f4a4ad",
+ "sha256:9260b201ce584d7825d900c88700aa0bd6b40d4ebac7b213857bd2babee9dbca",
+ "sha256:9a30384cc402eac099210ab9b8801b2ae21e591831253883decdb4513b77a3cd",
+ "sha256:9e29af877c29338f0cab5f049ccc8bd3ead289a557f144376c4fbc7d1b98914f",
+ "sha256:ab50da871bc109b2d9389259aac269dd1b7c7413ee02d06fe4e486ed26882159",
+ "sha256:b13c80b877e73bcb6f012813c6f4a9334fcf4b0e96681c5a15dac578f2eedfa0",
+ "sha256:bfe66b577a7118e05b04141f0f1ed0959552d45672aa7ecb3d91e319d846001e",
+ "sha256:e091bd424567efa4b9d94287a952597c05d22155a13716bf5f9f746b9dc906d3",
+ "sha256:fa2b38c8519c5a3aa6e2b4e1cf1a549b54acda6adb25397ff542068e73d1ed00"
+ ],
+ "version": "==2.5"
},
"fatcat-client": {
"editable": true,
@@ -152,6 +152,14 @@
"index": "pypi",
"version": "==0.2"
},
+ "ftfy": {
+ "hashes": [
+ "sha256:84a1614190173bb447ac9d581e50185c6aa35b538754b6bedaba0cc0f83d8e80",
+ "sha256:fa74757fb7cb444366fa6a79c2feabd40281a44dfbf6eaed492a804764ee26b2"
+ ],
+ "index": "pypi",
+ "version": "==5.5.1"
+ },
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
@@ -366,6 +374,13 @@
],
"version": "==1.24.1"
},
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ },
"werkzeug": {
"hashes": [
"sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c",
@@ -558,10 +573,10 @@
},
"parso": {
"hashes": [
- "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2",
- "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24"
+ "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d",
+ "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31"
],
- "version": "==0.3.1"
+ "version": "==0.3.2"
},
"pathlib2": {
"hashes": [
@@ -595,10 +610,10 @@
},
"pluggy": {
"hashes": [
- "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095",
- "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f"
+ "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
+ "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
],
- "version": "==0.8.0"
+ "version": "==0.8.1"
},
"prompt-toolkit": {
"hashes": [
@@ -610,38 +625,38 @@
},
"psycopg2": {
"hashes": [
- "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e",
- "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237",
- "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f",
- "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300",
- "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076",
- "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221",
- "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82",
- "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92",
- "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3",
- "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6",
- "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e",
- "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae",
- "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535",
- "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f",
- "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26",
- "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7",
- "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53",
- "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b",
- "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc",
- "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee",
- "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20",
- "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d",
- "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9",
- "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244",
- "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f",
- "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b",
- "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370",
- "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9",
- "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d",
- "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0"
- ],
- "version": "==2.7.6.1"
+ "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94",
+ "sha256:0e9873e60f98f0c52339abf8f0339d1e22bfe5aae0bcf7aabd40c055175035ec",
+ "sha256:1148a5eb29073280bf9057c7fc45468592c1bb75a28f6df1591adb93c8cb63d0",
+ "sha256:259a8324e109d4922b0fcd046e223e289830e2568d6f4132a3702439e5fd532b",
+ "sha256:28dffa9ed4595429e61bacac41d3f9671bb613d1442ff43bcbec63d4f73ed5e8",
+ "sha256:314a74302d4737a3865d40ea50e430ce1543c921ba10f39d562e807cfe2edf2a",
+ "sha256:36b60201b6d215d7658a71493fdf6bd5e60ad9a0cffed39906627ff9f4f3afd3",
+ "sha256:3f9d532bce54c4234161176ff3b8688ff337575ca441ea27597e112dfcd0ee0c",
+ "sha256:5d222983847b40af989ad96c07fc3f07e47925e463baa5de716be8f805b41d9b",
+ "sha256:6757a6d2fc58f7d8f5d471ad180a0bd7b4dd3c7d681f051504fbea7ae29c8d6f",
+ "sha256:6a0e0f1e74edb0ab57d89680e59e7bfefad2bfbdf7c80eb38304d897d43674bb",
+ "sha256:6ca703ccdf734e886a1cf53eb702261110f6a8b0ed74bcad15f1399f74d3f189",
+ "sha256:8513b953d8f443c446aa79a4cc8a898bd415fc5e29349054f03a7d696d495542",
+ "sha256:9262a5ce2038570cb81b4d6413720484cb1bc52c064b2f36228d735b1f98b794",
+ "sha256:97441f851d862a0c844d981cbee7ee62566c322ebb3d68f86d66aa99d483985b",
+ "sha256:a07feade155eb8e69b54dd6774cf6acf2d936660c61d8123b8b6b1f9247b67d6",
+ "sha256:a9b9c02c91b1e3ec1f1886b2d0a90a0ea07cc529cb7e6e472b556bc20ce658f3",
+ "sha256:ae88216f94728d691b945983140bf40d51a1ff6c7fe57def93949bf9339ed54a",
+ "sha256:b360ffd17659491f1a6ad7c928350e229c7b7bd83a2b922b6ee541245c7a776f",
+ "sha256:b4221957ceccf14b2abdabef42d806e791350be10e21b260d7c9ce49012cc19e",
+ "sha256:b90758e49d5e6b152a460d10b92f8a6ccf318fcc0ee814dcf53f3a6fc5328789",
+ "sha256:c669ea986190ed05fb289d0c100cc88064351f2b85177cbfd3564c4f4847d18c",
+ "sha256:d1b61999d15c79cf7f4f7cc9021477aef35277fc52452cf50fd13b713c84424d",
+ "sha256:de7bb043d1adaaf46e38d47e7a5f703bb3dab01376111e522b07d25e1a79c1e1",
+ "sha256:e393568e288d884b94d263f2669215197840d097c7e5b0acd1a51c1ea7d1aba8",
+ "sha256:ed7e0849337bd37d89f2c2b0216a0de863399ee5d363d31b1e5330a99044737b",
+ "sha256:f153f71c3164665d269a5d03c7fa76ba675c7a8de9dc09a4e2c2cdc9936a7b41",
+ "sha256:f1fb5a8427af099beb7f65093cbdb52e021b8e6dbdfaf020402a623f4181baf5",
+ "sha256:f36b333e9f86a2fba960c72b90c34be6ca71819e300f7b1fc3d2b0f0b2c546cd",
+ "sha256:f4526d078aedd5187d0508aa5f9a01eae6a48a470ed678406da94b4cd6524b7e"
+ ],
+ "version": "==2.7.7"
},
"ptyprocess": {
"hashes": [
@@ -674,11 +689,11 @@
},
"pytest": {
"hashes": [
- "sha256:3e65a22eb0d4f1bdbc1eacccf4a3198bf8d4049dea5112d70a0c61b00e748d02",
- "sha256:5924060b374f62608a078494b909d341720a050b5224ff87e17e12377486a71d"
+ "sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2",
+ "sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be"
],
"index": "pypi",
- "version": "==4.1.0"
+ "version": "==4.1.1"
},
"pytest-cov": {
"hashes": [
@@ -727,30 +742,30 @@
},
"typed-ast": {
"hashes": [
- "sha256:0555eca1671ebe09eb5f2176723826f6f44cca5060502fea259de9b0e893ab53",
- "sha256:0ca96128ea66163aea13911c9b4b661cb345eb729a20be15c034271360fc7474",
- "sha256:16ccd06d614cf81b96de42a37679af12526ea25a208bce3da2d9226f44563868",
- "sha256:1e21ae7b49a3f744958ffad1737dfbdb43e1137503ccc59f4e32c4ac33b0bd1c",
- "sha256:37670c6fd857b5eb68aa5d193e14098354783b5138de482afa401cc2644f5a7f",
- "sha256:46d84c8e3806619ece595aaf4f37743083f9454c9ea68a517f1daa05126daf1d",
- "sha256:5b972bbb3819ece283a67358103cc6671da3646397b06e7acea558444daf54b2",
- "sha256:6306ffa64922a7b58ee2e8d6f207813460ca5a90213b4a400c2e730375049246",
- "sha256:6cb25dc95078931ecbd6cbcc4178d1b8ae8f2b513ae9c3bd0b7f81c2191db4c6",
- "sha256:7e19d439fee23620dea6468d85bfe529b873dace39b7e5b0c82c7099681f8a22",
- "sha256:7f5cd83af6b3ca9757e1127d852f497d11c7b09b4716c355acfbebf783d028da",
- "sha256:81e885a713e06faeef37223a5b1167615db87f947ecc73f815b9d1bbd6b585be",
- "sha256:94af325c9fe354019a29f9016277c547ad5d8a2d98a02806f27a7436b2da6735",
- "sha256:b1e5445c6075f509d5764b84ce641a1535748801253b97f3b7ea9d948a22853a",
- "sha256:cb061a959fec9a514d243831c514b51ccb940b58a5ce572a4e209810f2507dcf",
- "sha256:cc8d0b703d573cbabe0d51c9d68ab68df42a81409e4ed6af45a04a95484b96a5",
- "sha256:da0afa955865920edb146926455ec49da20965389982f91e926389666f5cf86a",
- "sha256:dc76738331d61818ce0b90647aedde17bbba3d3f9e969d83c1d9087b4f978862",
- "sha256:e7ec9a1445d27dbd0446568035f7106fa899a36f55e52ade28020f7b3845180d",
- "sha256:f741ba03feb480061ab91a465d1a3ed2d40b52822ada5b4017770dfcb88f839f",
- "sha256:fe800a58547dd424cd286b7270b967b5b3316b993d86453ede184a17b5a6b17d"
+ "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe",
+ "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c",
+ "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2",
+ "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a",
+ "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7",
+ "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827",
+ "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33",
+ "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9",
+ "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032",
+ "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9",
+ "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2",
+ "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2",
+ "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062",
+ "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15",
+ "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357",
+ "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a",
+ "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824",
+ "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442",
+ "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1",
+ "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2",
+ "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6"
],
"markers": "python_version < '3.7' and implementation_name == 'cpython'",
- "version": "==1.1.1"
+ "version": "==1.2.0"
},
"urllib3": {
"hashes": [
@@ -768,9 +783,9 @@
},
"wrapt": {
"hashes": [
- "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
+ "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533"
],
- "version": "==1.10.11"
+ "version": "==1.11.1"
}
}
}
diff --git a/python/README_import.md b/python/README_import.md
index cc9a94e1..2465940b 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -26,11 +26,13 @@ the others:
wget https://archive.org/download/ia_papers_manifest_2018-01-25/index/idents_files_urls.sqlite.gz
wget https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_extra_metadata.csv
wget https://archive.org/download/issn_issnl_mappings/20180216.ISSN-to-ISSN-L.txt
- wget https://archive.org/download/orcid-dump-2017/public_profiles_API-2.0_2017_10_json.tar.gz
+ wget https://archive.org/download/orcid-dump-2017/public_profiles_1_2_json.all.json.gz
wget https://archive.org/download/ia_journal_pid_map_munge_20180908/release_ids.ia_munge_20180908.sqlite3.gz
wget https://archive.org/download/ia_test_paper_matches/2018-08-27-2352.17-matchcrossref.insertable.json.gz
wget https://archive.org/download/ia_papers_manifest_2018-01-25_matched/ia_papers_manifest_2018-01-25.matched.json.gz
+ gunzip public_profiles_1_2_json.all.json.gz
+
## ISSN
From CSV file:
@@ -54,13 +56,14 @@ Usually 24 hours or so on fast production machine.
## Matched
-Unknown speed!
+These each take 2-4 hours:
# No file update for the first import...
- zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates -
+ time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates -
# ... but do on the second
zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
# GROBID extracted (release+file)
time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata -
+
diff --git a/python/env.example b/python/env.example
index c986b9d2..75fc5238 100644
--- a/python/env.example
+++ b/python/env.example
@@ -1,4 +1,5 @@
-FLASK_SECRET_KEY=""
+FLASK_SECRET_KEY="TODO-REPLACE-ME"
+FATCAT_DOMAIN="dev.fatcat.wiki"
# This key used in tests
FATCAT_API_AUTH_TOKEN="AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug="
FATCAT_API_HOST="http://localhost:9411/v0"
@@ -14,6 +15,5 @@ SENTRY_DSN=""
# FATCAT_API_AUTH_TOKEN
FATCAT_AUTH_WORKER_CROSSREF=""
FATCAT_AUTH_WORKER_ORCID=""
-FATCAT_AUTH_WORKER_ISSN=""
-FATCAT_AUTH_WORKER_MATCHED=""
-FATCAT_AUTH_WORKER_GROBID_METADATA=""
+FATCAT_AUTH_WORKER_PUBMED=""
+FATCAT_AUTH_WORKER_DATACITE=""
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 0e176b2c..a47aa175 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -1,47 +1,40 @@
#!/usr/bin/env python3
-"""
-"""
-
import os, sys, argparse
from fatcat_tools import authenticated_api
-from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \
- IssnImporter, MatchedImporter, GrobidMetadataImporter, make_kafka_consumer
+from fatcat_tools.importers import *
def run_crossref(args):
- fci = CrossrefImporter(args.api, args.issn_map_file,
+ fci = CrossrefImporter(args.api,
+ args.issn_map_file,
extid_map_file=args.extid_map_file,
- create_containers=(not args.no_create_containers),
- check_existing=(not args.no_release_updates))
+ edit_batch_size=args.batch_size,
+ bezerk_mode=args.bezerk_mode)
if args.kafka_mode:
- consumer = make_kafka_consumer(
- args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import")
- fci.process_batch(consumer, size=args.batch_size, decode_kafka=True)
+ KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import").run()
else:
- fci.process_batch(args.json_file, size=args.batch_size)
- fci.describe_run()
+        JsonLinePusher(fci, args.json_file).run()
def run_orcid(args):
- foi = OrcidImporter(args.api)
- foi.process_batch(args.json_file, size=args.batch_size)
- foi.describe_run()
+ foi = OrcidImporter(args.api,
+ edit_batch_size=args.batch_size)
+ JsonLinePusher(foi, args.json_file).run()
-def run_issn(args):
- fii = IssnImporter(args.api)
- fii.process_csv_batch(args.csv_file, size=args.batch_size)
- fii.describe_run()
+def run_journal_metadata(args):
+ fii = JournalMetadataImporter(args.api,
+ edit_batch_size=args.batch_size)
+    CsvPusher(fii, args.csv_file).run()
def run_matched(args):
fmi = MatchedImporter(args.api,
- skip_file_updates=args.no_file_updates)
- fmi.process_batch(args.json_file, size=args.batch_size)
- fmi.describe_run()
+ bezerk_mode=args.bezerk_mode,
+ edit_batch_size=args.batch_size)
+ JsonLinePusher(fmi, args.json_file).run()
def run_grobid_metadata(args):
- fmi = GrobidMetadataImporter(args.api)
- fmi.process_source(args.tsv_file, group_size=args.group_size)
- fmi.describe_run()
+ fmi = GrobidMetadataImporter(args.api, edit_batch_size=args.batch_size, longtail_oa=args.longtail_oa)
+ LinePusher(fmi, args.tsv_file).run()
def main():
parser = argparse.ArgumentParser()
@@ -73,18 +66,15 @@ def main():
sub_crossref.add_argument('--extid-map-file',
help="DOI-to-other-identifiers sqlite3 database",
default=None, type=str)
- sub_crossref.add_argument('--no-create-containers',
- action='store_true',
- help="skip creation of new container entities based on ISSN")
sub_crossref.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
sub_crossref.add_argument('--kafka-mode',
action='store_true',
help="consume from kafka topic (not stdin)")
- sub_crossref.add_argument('--no-release-updates',
+ sub_crossref.add_argument('--bezerk-mode',
action='store_true',
- help="don't lookup existing DOIs, just insert (only for bootstrap)")
+ help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
sub_orcid = subparsers.add_parser('orcid')
sub_orcid.set_defaults(
@@ -98,37 +88,37 @@ def main():
help="size of batch to send",
default=50, type=int)
- sub_issn = subparsers.add_parser('issn')
- sub_issn.set_defaults(
- func=run_issn,
- auth_var="FATCAT_AUTH_WORKER_ISSN",
+ sub_journal_metadata = subparsers.add_parser('journal-metadata')
+ sub_journal_metadata.set_defaults(
+ func=run_journal_metadata,
+ auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
)
- sub_issn.add_argument('csv_file',
+ sub_journal_metadata.add_argument('csv_file',
help="Journal ISSN CSV metadata file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_issn.add_argument('--batch-size',
+ sub_journal_metadata.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
sub_matched = subparsers.add_parser('matched')
sub_matched.set_defaults(
func=run_matched,
- auth_var="FATCAT_AUTH_WORKER_MATCHED",
+ auth_var="FATCAT_API_AUTH_TOKEN",
)
sub_matched.add_argument('json_file',
help="JSON file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_matched.add_argument('--no-file-updates',
- action='store_true',
- help="don't lookup existing files, just insert (only for bootstrap)")
sub_matched.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
+ sub_matched.add_argument('--bezerk-mode',
+ action='store_true',
+ help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
sub_grobid_metadata.set_defaults(
func=run_grobid_metadata,
- auth_var="FATCAT_AUTH_WORKER_GROBID_METADATA",
+ auth_var="FATCAT_API_AUTH_TOKEN",
)
sub_grobid_metadata.add_argument('tsv_file',
help="TSV file to import from (or stdin)",
@@ -136,6 +126,9 @@ def main():
sub_grobid_metadata.add_argument('--group-size',
help="editgroup group size to use",
default=75, type=int)
+    sub_grobid_metadata.add_argument('--longtail-oa',
+ action='store_true',
+ help="if this is an import of longtail OA content (sets an 'extra' flag)")
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -144,6 +137,7 @@ def main():
args.api = authenticated_api(
args.host_url,
+ # token is an optional kwarg (can be empty string, None, etc)
token=os.environ.get(args.auth_var))
args.func(args)
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index e6f081e5..70f38f5b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -1,7 +1,22 @@
-from .common import FatcatImporter, make_kafka_consumer
+"""
+To run an import you combine two classes; one each of:
+
+- RecordPusher: somehow iterates over a source of raw records (eg, from a
+ database, Kafka, files on disk, stdin) and pushes into an entity importer.
+- EntityImporter: class that a record iterator pushes raw (unparsed) records
+ into. The entity importer parses and decides what to do (ignore, update,
+ insert, etc). There is usually a primary entity type, though related entities
+ can be created along the way. Maintains API connection and editgroup/batch
+ state.
+
+"""
+
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .grobid_metadata import GrobidMetadataImporter
-from .issn import IssnImporter
+from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
from .orcid import OrcidImporter
+#from .kafka_source import KafkaSource
+#from .file_source import FileSource
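
As a rough sketch of how the pusher/importer split described in the module docstring above is meant to be combined (this is illustrative only, not code from this change; the input file names are placeholders, and the ISSN map file is the one referenced in README_import.md):

    # sketch: wire one RecordPusher to one EntityImporter
    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import CrossrefImporter, JsonLinePusher

    api = authenticated_api("http://localhost:9411/v0", token=None)  # token is optional
    with open("20180216.ISSN-to-ISSN-L.txt") as issn_map, \
         open("crossref-works.json") as records:              # placeholder input file
        importer = CrossrefImporter(api, issn_map, edit_batch_size=50)
        counts = JsonLinePusher(importer, records).run()      # prints and returns the counts
    # counts is a Counter with 'insert', 'update', 'exists', 'skip', and 'total' keys
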
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 06897bee..89203a4f 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,6 +3,7 @@ import re
import sys
import csv
import json
+import ftfy
import itertools
import subprocess
from collections import Counter
@@ -12,30 +13,66 @@ import fatcat_client
from fatcat_client.rest import ApiException
-# from: https://docs.python.org/3/library/itertools.html
-def grouper(iterable, n, fillvalue=None):
- "Collect data into fixed-length chunks or blocks"
- args = [iter(iterable)] * n
- return itertools.zip_longest(*args, fillvalue=fillvalue)
+def clean(thing, force_xml=False):
+ """
+ This function is appropriate to be called on any random, non-markup string,
+ such as author names, titles, etc.
-def make_kafka_consumer(hosts, env, topic_suffix, group):
- topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
- client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
- consume_topic = client.topics[topic_name]
- print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+    It will try to clean up common unicode mangles, HTML characters, etc.
- consumer = consume_topic.get_balanced_consumer(
- consumer_group=group.encode('utf-8'),
- managed=True,
- auto_commit_enable=True,
- auto_commit_interval_ms=30000, # 30 seconds
- compacted_topic=True,
- )
- return consumer
+ This will detect XML/HTML and "do the right thing" (aka, not remove
+ entities like '&amp' if there are tags in the string), unless you pass the
+ 'force_xml' parameter, which might be appropriate for, eg, names and
+ titles, which generally should be projected down to plain text.
+
+ Also strips extra whitespace.
+ """
+ if not thing:
+ return thing
+ fix_entities = 'auto'
+ if force_xml:
+ fix_entities = True
+ fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+ if not fixed:
+ # wasn't zero-length before, but is now; return None
+ return None
+ return fixed
+
+def test_clean():
-class FatcatImporter:
+ assert clean(None) == None
+ assert clean('') == ''
+ assert clean('123') == '123'
+ assert clean('a&amp;b') == 'a&b'
+ assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+ assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+class EntityImporter:
"""
- Base class for fatcat importers
+ Base class for fatcat entity importers.
+
+    The API exposed to the record pusher is:
+
+        push_record(raw_record)
+        finish()
+
+    The methods that implementations are expected to fill in are:
+
+        want(raw_record) -> boolean
+        parse_record(raw_record) -> entity
+        try_update(entity) -> boolean
+        insert_batch([entity]) -> None
+
+ This class exposes helpers for implementations:
+
+ self.api
+ self.create_<entity>(entity) -> EntityEdit
+ for related entity types
+ self.push_entity(entity)
+        self.counts['exists'] += 1
+            (if we didn't update or insert because the entity already exists)
+        self.counts['update'] += 1
+            (if an existing entity was updated)
"""
def __init__(self, api, **kwargs):
@@ -43,87 +80,135 @@ class FatcatImporter:
eg_extra = kwargs.get('editgroup_extra', dict())
eg_extra['git_rev'] = eg_extra.get('git_rev',
subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter')
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
self.api = api
- self._editgroup_description = kwargs.get('editgroup_description')
- self._editgroup_extra = kwargs.get('editgroup_extra')
- issn_map_file = kwargs.get('issn_map_file')
+ self.bezerk_mode = kwargs.get('bezerk_mode', False)
+ self.edit_batch_size = kwargs.get('edit_batch_size', 100)
+ self.editgroup_description = kwargs.get('editgroup_description')
+ self.editgroup_extra = kwargs.get('editgroup_extra')
+ self.reset()
self._issnl_id_map = dict()
self._orcid_id_map = dict()
- self._doi_id_map = dict()
- if issn_map_file:
- self.read_issn_map_file(issn_map_file)
self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
- self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0})
+ self._doi_id_map = dict()
- def _editgroup(self):
- eg = fatcat_client.Editgroup(
- description=self._editgroup_description,
- extra=self._editgroup_extra,
- )
- return self.api.create_editgroup(eg)
+ def reset(self):
+ self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+ self._edit_count = 0
+ self._editgroup_id = None
+ self._entity_queue = []
- def describe_run(self):
- print("Processed {} lines, inserted {}, updated {}.".format(
- self.counts['processed_lines'], self.counts['insert'], self.counts['update']))
+ def push_record(self, raw_record):
+ """
+ Returns nothing.
+ """
+ if (not raw_record) or (not self.want(raw_record)):
+ self.counts['skip'] += 1
+ return
+ entity = self.parse_record(raw_record)
+ if not entity:
+ self.counts['skip'] += 1
+ return
+ if self.bezerk_mode:
+ self.push_entity(entity)
+ return
+ if self.try_update(entity):
+ self.push_entity(entity)
+ return
- def create_row(self, row, editgroup_id=None):
- # sub-classes expected to implement this
- raise NotImplementedError
+ def finish(self):
+ if self._edit_count > 0:
+ self.api.accept_editgroup(self._editgroup_id)
+ self._editgroup_id = None
+ self._edit_count = 0
+
+ if self._entity_queue:
+ self.insert_batch(self._entity_queue)
+ self.counts['insert'] += len(self._entity_queue)
+ self._entity_queue = []
+
+ self.counts['total'] = 0
+ for key in ('skip', 'insert', 'update', 'exists'):
+ self.counts['total'] += self.counts[key]
+ return self.counts
+
+ def _get_editgroup(self, edits=1):
+ if self._edit_count >= self.edit_batch_size:
+ self.api.accept_editgroup(self._editgroup_id)
+ self._editgroup_id = None
+ self._edit_count = 0
- def create_batch(self, rows, editgroup_id=None):
- # sub-classes expected to implement this
+ if not self._editgroup_id:
+ eg = self.api.create_editgroup(
+ fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra))
+ self._editgroup_id = eg.editgroup_id
+
+ self._edit_count += edits
+ return self._editgroup_id
+
+ def create_container(self, entity):
+ eg_id = self._get_editgroup()
+ self.counts['inserted.container'] += 1
+ return self.api.create_container(entity, editgroup_id=eg_id)
+
+ def create_release(self, entity):
+ eg_id = self._get_editgroup()
+ self.counts['inserted.release'] += 1
+ return self.api.create_release(entity, editgroup_id=eg_id)
+
+ def create_file(self, entity):
+ eg_id = self._get_editgroup()
+ self.counts['inserted.file'] += 1
+ return self.api.create_file(entity, editgroup_id=eg_id)
+
+ def updated(self):
+ """
+ Implementations should call this from try_update() if the update was successful
+ """
+ self.counts['update'] += 1
+
+ def push_entity(self, entity):
+ self._entity_queue.append(entity)
+ if len(self._entity_queue) >= self.edit_batch_size:
+ self.insert_batch(self._entity_queue)
+            self.counts['insert'] += len(self._entity_queue)
+            self._entity_queue = []
+
+ def want(self, raw_record):
+ """
+ Implementations can override for optional fast-path to drop a record.
+ Must have no side-effects; returns bool.
+ """
+ return True
+
+    def parse_record(self, raw_record):
+ """
+ Returns an entity class type, or None if we should skip this one.
+
+ May have side-effects (eg, create related entities), but shouldn't
+ update/mutate the actual entity.
+ """
raise NotImplementedError
- def process_source(self, source, group_size=100):
- """Creates and auto-accepts editgroup every group_size rows"""
- eg = self._editgroup()
- i = 0
- for i, row in enumerate(source):
- self.create_row(row, editgroup_id=eg.editgroup_id)
- if i > 0 and (i % group_size) == 0:
- self.api.accept_editgroup(eg.editgroup_id)
- eg = self._editgroup()
- self.counts['processed_lines'] += 1
- if i == 0 or (i % group_size) != 0:
- self.api.accept_editgroup(eg.editgroup_id)
-
- def process_batch(self, source, size=50, decode_kafka=False):
- """Reads and processes in batches (not API-call-per-)"""
- for rows in grouper(source, size):
- if decode_kafka:
- rows = [msg.value.decode('utf-8') for msg in rows]
- self.counts['processed_lines'] += len(rows)
- #eg = self._editgroup()
- #self.create_batch(rows, editgroup_id=eg.editgroup_id)
- self.create_batch(rows)
-
- def process_csv_source(self, source, group_size=100, delimiter=','):
- reader = csv.DictReader(source, delimiter=delimiter)
- self.process_source(reader, group_size)
-
- def process_csv_batch(self, source, size=50, delimiter=','):
- reader = csv.DictReader(source, delimiter=delimiter)
- self.process_batch(reader, size)
+    def try_update(self, entity):
+        """
+        Passed the output of parse_record(). Should try to find an existing entity and
+ update it (PUT), decide we should do nothing (based on the existing
+ record), or create a new one.
- def is_issnl(self, issnl):
- return len(issnl) == 9 and issnl[4] == '-'
+ Implementations must update the exists/updated/skip counts
+ appropriately in this method.
- def lookup_issnl(self, issnl):
- """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
- if issnl in self._issnl_id_map:
- return self._issnl_id_map[issnl]
- container_id = None
- try:
- rv = self.api.lookup_container(issnl=issnl)
- container_id = rv.ident
- except ApiException as ae:
- # If anything other than a 404 (not found), something is wrong
- assert ae.status == 404
- self._issnl_id_map[issnl] = container_id # might be None
- return container_id
+ Returns boolean: True if the entity should still be inserted, False otherwise
+ """
+ raise NotImplementedError
+
+    def insert_batch(self, batch):
+ raise NotImplementedError
def is_orcid(self, orcid):
return self._orcid_regex.match(orcid) is not None
@@ -163,6 +248,23 @@ class FatcatImporter:
self._doi_id_map[doi] = release_id # might be None
return release_id
+ def is_issnl(self, issnl):
+ return len(issnl) == 9 and issnl[4] == '-'
+
+ def lookup_issnl(self, issnl):
+ """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
+ if issnl in self._issnl_id_map:
+ return self._issnl_id_map[issnl]
+ container_id = None
+ try:
+ rv = self.api.lookup_container(issnl=issnl)
+ container_id = rv.ident
+ except ApiException as ae:
+ # If anything other than a 404 (not found), something is wrong
+ assert ae.status == 404
+ self._issnl_id_map[issnl] = container_id # might be None
+ return container_id
+
def read_issn_map_file(self, issn_map_file):
print("Loading ISSN map file...")
self._issn_issnl_map = dict()
@@ -179,3 +281,117 @@ class FatcatImporter:
if issn is None:
return None
return self._issn_issnl_map.get(issn)
+
+
+class RecordPusher:
+ """
+ Base class for different importer sources. Pretty trivial interface, just
+ wraps an importer and pushes records in to it.
+ """
+
+ def __init__(self, importer, **kwargs):
+ self.importer = importer
+
+ def run(self):
+ """
+ This will look something like:
+
+ for line in sys.stdin:
+ record = json.loads(line)
+ self.importer.push_record(record)
+ print(self.importer.finish())
+ """
+ raise NotImplementedError
+
+
+class JsonLinePusher(RecordPusher):
+
+ def __init__(self, importer, json_file, **kwargs):
+ self.importer = importer
+ self.json_file = json_file
+
+ def run(self):
+ for line in self.json_file:
+ if not line:
+ continue
+ record = json.loads(line)
+ self.importer.push_record(record)
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+class CsvPusher(RecordPusher):
+
+ def __init__(self, importer, csv_file, **kwargs):
+ self.importer = importer
+ self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ','))
+
+ def run(self):
+ for line in self.reader:
+ if not line:
+ continue
+ self.importer.push_record(line)
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+class LinePusher(RecordPusher):
+
+ def __init__(self, importer, text_file, **kwargs):
+ self.importer = importer
+ self.text_file = text_file
+
+ def run(self):
+ for line in self.text_file:
+ if not line:
+ continue
+ self.importer.push_record(line)
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+class KafkaJsonPusher(RecordPusher):
+
+ def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
+ self.importer = importer
+ self.consumer = make_kafka_consumer(
+ kafka_hosts,
+ kafka_env,
+ topic_suffix,
+ group,
+ )
+
+ def run(self):
+ count = 0
+ for msg in self.consumer:
+ if not msg:
+ continue
+ record = json.loads(msg.value.decode('utf-8'))
+ self.importer.push_record(record)
+ count += 1
+ if count % 500 == 0:
+ print("Import counts: {}".format(self.importer.counts))
+ # TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
+ # commit the current batch if it has been lingering
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+def make_kafka_consumer(hosts, env, topic_suffix, group):
+ topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
+ client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
+ consume_topic = client.topics[topic_name]
+ print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+
+ consumer = consume_topic.get_balanced_consumer(
+ consumer_group=group.encode('utf-8'),
+ managed=True,
+ auto_commit_enable=True,
+ auto_commit_interval_ms=30000, # 30 seconds
+ compacted_topic=True,
+ )
+ return consumer
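
To make the EntityImporter contract above concrete: a new importer only needs the four methods named in the class docstring, and inherits everything else (editgroup batching, counts, the final flush in finish()). A hypothetical minimal sketch, not an importer that exists in this tree:

    # sketch: minimal EntityImporter subclass (for illustration only)
    import json
    import fatcat_client
    from fatcat_tools.importers import EntityImporter, JsonLinePusher, clean

    class MinimalReleaseImporter(EntityImporter):
        """Creates one release per JSON record that has a 'title' field."""

        def want(self, raw_record):
            # cheap pre-filter; must have no side effects
            return bool(raw_record.get('title'))

        def parse_record(self, raw_record):
            return fatcat_client.ReleaseEntity(
                title=clean(raw_record['title'], force_xml=True),
                release_type="article-journal")

        def try_update(self, entity):
            # returning True means "go ahead and insert"; a real importer would
            # look up an existing entity here and bump self.counts['exists']
            return True

        def insert_batch(self, batch):
            self.api.create_release_batch(batch, autoaccept=True,
                description=self.editgroup_description,
                extra=json.dumps(self.editgroup_extra))

    # usage: JsonLinePusher(MinimalReleaseImporter(api), open("records.json")).run()
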
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 6365e491..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@ import datetime
import itertools
import subprocess
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
# The docs/guide should be the cannonical home for these mappings; update there
@@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = {
'standard': 'standard',
}
-class CrossrefImporter(FatcatImporter):
+CONTAINER_TYPE_MAP = {
+ 'article-journal': 'journal',
+ 'paper-conference': 'conference',
+ 'book': 'book-series',
+}
+
+# TODO:
+LICENSE_SLUG_MAP = {
+ "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+ "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+ "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+ "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+ "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+ "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+ "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+ "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+ "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+ "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+ "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+ "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+ "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+ # http://www.springer.com/tdm doesn't seem like a license
+}
+
+class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter):
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
+
+ self.create_containers = kwargs.get('create_containers')
extid_map_file = kwargs.get('extid_map_file')
- create_containers = kwargs.get('create_containers')
- check_existing = kwargs.get('check_existing')
self.extid_map_db = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter):
self.extid_map_db = sqlite3.connect(db_uri, uri=True)
else:
print("Not using external ID map")
- self.create_containers = create_containers
- self.check_existing = check_existing
+
+ self.read_issn_map_file(issn_map_file)
def lookup_ext_ids(self, doi):
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
[doi.lower()]).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
row = [str(cell or '') or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
pmcid=row[2],
- wikidata_qid=row[3])
+ wikidata_qid=row[3],
+ # TODO:
+ arxiv_id=None,
+ jstor_id=None,
+ )
def map_release_type(self, crossref_type):
return CROSSREF_TYPE_MAP.get(crossref_type)
- def parse_crossref_dict(self, obj):
+ def map_container_type(self, crossref_type):
+ return CONTAINER_TYPE_MAP.get(crossref_type)
+
+ def want(self, obj):
+ if not obj.get('title'):
+ return False
+
+ # do most of these checks in-line below
+ return True
+
+ def parse_record(self, obj):
"""
obj is a python dict (parsed from json).
returns a ReleaseEntity
"""
- # Do require the 'title' keys to exsit, as release entities do
- if (not 'title' in obj) or (not obj['title']):
- return None
-
# Ways to be out of scope (provisionally)
# journal-issue and journal-volume map to None, but allowed for now
if obj.get('type') in (None, 'journal', 'proceedings',
@@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter):
'book-track', 'proceedings-series'):
return None
- # lookup existing DOI
- existing_release = None
- if self.check_existing:
- try:
- existing_release = self.api.lookup_release(doi=obj['DOI'].lower())
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
-
- # eventually we'll want to support "updates", but for now just skip if
- # entity already exists
- if existing_release:
+        # Do require the 'title' key to exist, as release entities do
+ if (not 'title' in obj) or (not obj['title']):
return None
+ release_type = self.map_release_type(obj['type'])
+
# contribs
def do_contribs(obj_list, ctype):
contribs = []
@@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter):
index = i
else:
index = None
+ raw_affiliation = None
if am.get('affiliation'):
- # note: affiliation => affiliations
- extra['affiliations'] = am.get('affiliation')
+ if len(am.get('affiliation')) > 0:
+ raw_affiliation = am.get('affiliation')[0]['name']
+ if len(am.get('affiliation')) > 1:
+ # note: affiliation => more_affiliations
+ extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
if am.get('sequence') and am.get('sequence') != "additional":
- extra['sequence'] = am.get('sequence')
+ extra['seq'] = clean(am.get('sequence'))
if not extra:
extra = None
assert ctype in ("author", "editor", "translator")
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
- raw_name=raw_name,
+ raw_name=clean(raw_name),
+ raw_affiliation=clean(raw_affiliation),
role=ctype,
extra=extra))
return contribs
@@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter):
container_id = self.lookup_issnl(issnl)
publisher = obj.get('publisher')
- ce = None
if (container_id is None and self.create_containers and (issnl is not None)
and obj.get('container-title') and len(obj['container-title']) > 0):
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- publisher=publisher,
- name=obj['container-title'][0])
+ publisher=clean(publisher),
+ container_type=self.map_container_type(release_type),
+ name=clean(obj['container-title'][0], force_xml=True))
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
+
+ # license slug
+ license_slug = None
+ license_extra = []
+ for l in obj.get('license', []):
+ if l['content-version'] not in ('vor', 'unspecified'):
+ continue
+ slug = LICENSE_SLUG_MAP.get(l['URL'])
+ if slug:
+ license_slug = slug
+ if 'start' in l:
+ l['start'] = l['start']['date-time']
+ license_extra.append(l)
# references
refs = []
for i, rm in enumerate(obj.get('reference', [])):
try:
year = int(rm.get('year'))
- # NOTE: will need to update/config in the future!
+ # TODO: will need to update/config in the future!
# NOTE: are there crossref works with year < 100?
if year > 2025 or year < 100:
year = None
except:
year = None
- extra = rm.copy()
- if rm.get('DOI'):
- extra['doi'] = rm.get('DOI').lower()
key = rm.get('key')
if key and key.startswith(obj['DOI'].upper()):
key = key.replace(obj['DOI'].upper() + "-", '')
@@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter):
container_name = rm.get('volume-title')
if not container_name:
container_name = rm.get('journal-title')
- extra.pop('DOI', None)
- extra.pop('key', None)
- extra.pop('year', None)
- extra.pop('volume-name', None)
- extra.pop('journal-title', None)
- extra.pop('title', None)
- extra.pop('first-page', None)
- extra.pop('doi-asserted-by', None)
+            extra = dict()
+            if rm.get('volume-title') and rm.get('journal-title'):
+                extra['journal-title'] = rm['journal-title']
+ if rm.get('DOI'):
+ extra['doi'] = rm.get('DOI').lower()
+ # TODO: what fields here? CSL citation stuff
+ for k in ('author', 'editor', 'edition', 'authority', 'version',
+ 'genre', 'url', 'event', 'issue', 'volume', 'date',
+ 'accessed_date', 'issued', 'page', 'medium',
+ 'collection_title', 'chapter_number'):
+ if clean(rm.get(k)):
+ extra[k] = clean(rm[k])
if extra:
extra = dict(crossref=extra)
else:
@@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter):
target_release_id=None,
key=key,
year=year,
- container_name=container_name,
- title=rm.get('title'),
- locator=rm.get('first-page'),
+ container_name=clean(container_name),
+ title=clean(rm.get('title')),
+ locator=clean(rm.get('first-page')),
# TODO: just dump JSON somewhere here?
extra=extra))
@@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter):
if obj.get('abstract') != None:
abstracts.append(fatcat_client.ReleaseEntityAbstracts(
mimetype="application/xml+jats",
- content=obj.get('abstract')))
+ content=clean(obj.get('abstract'))))
# extra fields
extra = dict()
- for key in ('subject', 'type', 'license', 'alternative-id',
- 'container-title', 'original-title', 'subtitle', 'archive',
- 'funder', 'group-title'):
- # TODO: unpack "container-title" array
+ for key in ('subject', 'type', 'alternative-id', 'container-title',
+ 'subtitle', 'archive', 'funder', 'group-title'):
+ # TODO: unpack "container-title" array?
val = obj.get(key)
if val:
- extra[key] = val
- if 'license' in extra and extra['license']:
- for i in range(len(extra['license'])):
- if 'start' in extra['license'][i]:
- extra['license'][i]['start'] = extra['license'][i]['start']['date-time']
+ if type(val) == str:
+ extra[key] = clean(val)
+ else:
+ extra[key] = val
+ if license_extra:
+ extra['license'] = license_extra
+
if len(obj['title']) > 1:
- extra['other-titles'] = obj['title'][1:]
- # TODO: this should be top-level
- extra['is_kept'] = len(obj.get('archive', [])) > 0
+ extra['other-titles'] = [clean(t) for t in obj['title'][1:]]
# ISBN
isbn13 = None
@@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter):
re = fatcat_client.ReleaseEntity(
work_id=None,
- title=obj.get('title', [None])[0],
- contribs=contribs,
- refs=refs,
container_id=container_id,
- publisher=publisher,
- release_type=self.map_release_type(obj['type']),
+ title=clean(obj.get('title', [None])[0], force_xml=True),
+ original_title=clean(obj.get('original-title', [None])[0]),
+ release_type=release_type,
release_status=release_status,
+ release_date=release_date,
+ release_year=release_year,
+ publisher=clean(publisher),
doi=obj['DOI'].lower(),
- isbn13=isbn13,
- core_id=extids['core_id'],
pmid=extids['pmid'],
pmcid=extids['pmcid'],
wikidata_qid=extids['wikidata_qid'],
- release_date=release_date,
- release_year=release_year,
- issue=obj.get('issue'),
- volume=obj.get('volume'),
- pages=obj.get('page'),
+ isbn13=isbn13,
+ core_id=extids['core_id'],
+ arxiv_id=extids['arxiv_id'],
+ jstor_id=extids['jstor_id'],
+ volume=clean(obj.get('volume')),
+ issue=clean(obj.get('issue')),
+ pages=clean(obj.get('page')),
+ language=None, # crossref doesn't supply language info
+ license_slug=license_slug,
+ extra=dict(crossref=extra),
abstracts=abstracts,
- extra=dict(crossref=extra))
- return (re, ce)
+ contribs=contribs,
+ refs=refs,
+ )
+ return re
+
+ def try_update(self, re):
+
+ # lookup existing DOI (don't need to try other ext idents for crossref)
+ existing = None
+ try:
+ existing = self.api.lookup_release(doi=re.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ # doesn't exist, need to update
+ return True
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_release_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
- def create_row(self, row, editgroup_id=None):
- if row is None:
- return
- obj = json.loads(row)
- entities = self.parse_crossref_dict(obj)
- if entities is not None:
- (re, ce) = entities
- if ce is not None:
- container = self.api.create_container(ce, editgroup_id=editgroup_id)
- re.container_id = container.ident
- self._issnl_id_map[ce.issnl] = container.ident
- self.api.create_release(re, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- def create_batch(self, batch):
- """Current work/release pairing disallows batch creation of releases.
- Could do batch work creation and then match against releases, but meh."""
- release_batch = []
- for row in batch:
- if row is None:
- continue
- obj = json.loads(row)
- entities = self.parse_crossref_dict(obj)
- if entities is not None:
- (re, ce) = entities
- if ce is not None:
- ce_eg = self.api.create_editgroup(fatcat_client.Editgroup())
- container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id)
- self.api.accept_editgroup(ce_eg.editgroup_id)
- re.container_id = container.ident
- self._issnl_id_map[ce.issnl] = container.ident
- release_batch.append(re)
- self.api.create_release_batch(release_batch, autoaccept="true")
- self.counts['insert'] += len(release_batch)
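
For reference, the license handling added above reduces a Crossref 'license' list to at most one slug plus cleaned-up 'extra' entries. Pulled out of the importer, the loop behaves roughly like this (the input record is made up):

    # sketch: LICENSE_SLUG_MAP applied to a made-up crossref 'license' list
    from fatcat_tools.importers.crossref import LICENSE_SLUG_MAP

    record_licenses = [
        {"URL": "http://www.springer.com/tdm", "content-version": "tdm"},
        {"URL": "http://creativecommons.org/licenses/by/4.0/",
         "content-version": "vor",
         "start": {"date-time": "2018-01-01T00:00:00Z"}},
    ]

    license_slug = None
    license_extra = []
    for l in record_licenses:
        if l['content-version'] not in ('vor', 'unspecified'):
            continue  # only version-of-record (or unspecified) licenses count
        slug = LICENSE_SLUG_MAP.get(l['URL'])
        if slug:
            license_slug = slug
        if 'start' in l:
            l['start'] = l['start']['date-time']  # flatten the date for 'extra'
        license_extra.append(l)

    assert license_slug == "CC-BY"
    assert len(license_extra) == 1
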
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 5e61a154..9d95fe0b 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,22 @@ import json
import base64
import datetime
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
MAX_ABSTRACT_BYTES=4096
-class GrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(EntityImporter):
+ """
+ This is a complex case: we need to parse and create both file and release entities.
+
+ The "primary" entity here is really File, not Release. If a matching File
+ exists, we bail in want(); if not we insert the Release during parsing, and
+ insert both.
+
+ TODO: should instead check if the File has any releases; if not, insert and update.
+ TODO: relaxing 'None' constraint on parse_record() might make this refactor-able.
+ """
def __init__(self, api, **kwargs):
@@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter):
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
self.default_link_rel = kwargs.get("default_link_rel", "web")
+ self.longtail_oa = kwargs.get("longtail_oa", False)
+
+ def want(self, raw_record):
+ return True
+
+ def parse_record(self, row):
+
+ fields = row.split('\t')
+ sha1_key = fields[0]
+ cdx = json.loads(fields[1])
+ mimetype = fields[2]
+ file_size = int(fields[3])
+ grobid_meta = json.loads(fields[4])
+ fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
+ re = self.parse_grobid_json(grobid_meta)
+
+ if not (fe and re):
+ return None
+
+ # lookup existing file SHA1
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # if file is already in here, presumably not actually long-tail
+ # HACK: this is doing an exists check in parse_record(), which is weird
+ # TODO: this is where we should check if the file actually has
+ # release_ids and/or URLs associated with it
+ if existing and not self.bezerk_mode:
+ self.counts['exists'] += 1
+ self.counts['skip'] -= 1
+ return None
+
+ release_edit = self.create_release(re)
+ fe.release_ids.append(release_edit.ident)
+ return fe
def parse_grobid_json(self, obj):
@@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter):
abobj = dict(
mimetype="text/plain",
language=None,
- content=obj.get('abstract').strip())
+ content=clean(obj.get('abstract')))
abstracts = [abobj]
else:
abstracts = None
@@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter):
for i, a in enumerate(obj.get('authors', [])):
contribs.append(fatcat_client.ReleaseContrib(
index=i,
- raw_name=a['name'],
+ raw_name=clean(a['name']),
role="author",
extra=None))
+ # XXX: why is this a dict()? not covered by tests?
refs = []
for raw in obj.get('citations', []):
cite_extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
+ ref['key'] = clean(raw.get('id'))
if raw.get('title'):
- ref['title'] = raw['title'].strip()
+ ref['title'] = clean(raw['title'])
if raw.get('date'):
try:
year = int(raw['date'].strip()[:4])
@@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter):
pass
for key in ('volume', 'url', 'issue', 'publisher'):
if raw.get(key):
- cite_extra[key] = raw[key].strip()
+ cite_extra[key] = clean(raw[key])
if raw.get('authors'):
- cite_extra['authors'] = [a['name'] for a in raw['authors']]
+ cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
if cite_extra:
cite_extra = dict(grobid=cite_extra)
else:
@@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter):
if obj.get('doi'):
extra['doi'] = obj['doi']
if obj['journal'] and obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
-
- extra['is_longtail_oa'] = True
+ extra['container_name'] = clean(obj['journal']['name'])
# TODO: ISSN/eISSN handling? or just journal name lookup?
+ if self.longtail_oa:
+ extra['longtail_oa'] = True
+
if extra:
extra = dict(grobid=extra)
else:
extra = None
re = fatcat_client.ReleaseEntity(
- title=obj['title'].strip(),
+ title=clean(obj['title'], force_xml=True),
release_type="article-journal",
release_date=release_date,
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=clean(obj['journal'].get('publisher')),
+ volume=clean(obj['journal'].get('volume')),
+ issue=clean(obj['journal'].get('issue')),
abstracts=abstracts,
extra=extra)
return re
@@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter):
sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
- # lookup existing SHA1, or create new entity
- try:
- existing_file = self.api.lookup_file(sha1=sha1)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- existing_file = None
-
- if existing_file:
- # if file is already in here, presumably not actually long-tail
- return None
fe = fatcat_client.FileEntity(
sha1=sha1,
size=int(file_size),
@@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter):
# parse URLs and CDX
original = cdx['url']
+ assert len(cdx['dt']) >= 8
wayback = "https://web.archive.org/web/{}/{}".format(
cdx['dt'],
original)
@@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter):
return fe
- def create_row(self, row, editgroup_id=None):
- if not row:
- return
- fields = row.split('\t')
- sha1_key = fields[0]
- cdx = json.loads(fields[1])
- mimetype = fields[2]
- file_size = int(fields[3])
- grobid_meta = json.loads(fields[4])
- fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
- re = self.parse_grobid_json(grobid_meta)
- if fe and re:
- release_entity = self.api.create_release(re, editgroup_id=editgroup_id)
- # release ident can't already be in release list because we just
- # created it
- fe.release_ids.append(release_entity.ident)
- file_entity = self.api.create_file(fe, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- # NB: batch mode not implemented
+ def try_update(self, entity):
+ # the exists check already happened in parse_record(), because we needed to create a release first
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_file_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+
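
For reference, each record this importer consumes is a single TSV line with five columns, which parse_record() above splits and parses. A rough sketch of driving the importer end-to-end, assuming a LinePusher helper analogous to the JsonLinePusher used elsewhere in this commit, an already-configured `api` client, and the sample file shipped under tests/ (the longtail_oa kwarg is an assumption based on self.longtail_oa above):

    from fatcat_tools.importers import GrobidMetadataImporter, LinePusher

    # each line: sha1_key <tab> cdx_json <tab> mimetype <tab> size <tab> grobid_json
    importer = GrobidMetadataImporter(api, longtail_oa=True)
    with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f:
        counts = LinePusher(importer, f).run()
    # counts is presumably a dict like {'insert': ..., 'exists': ..., 'skip': ...}
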
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
deleted file mode 100644
index f4d525a4..00000000
--- a/python/fatcat_tools/importers/issn.py
+++ /dev/null
@@ -1,89 +0,0 @@
-
-import sys
-import json
-import itertools
-import fatcat_client
-from .common import FatcatImporter
-
-
-def or_none(s):
- if s is None:
- return None
- if len(s) == 0:
- return None
- return s
-
-def truthy(s):
- if s is None:
- return None
- s = s.lower()
-
- if s in ('true', 't', 'yes', 'y', '1'):
- return True
- elif s in ('false', 'f', 'no', 'n', '0'):
- return False
- else:
- return None
-
-class IssnImporter(FatcatImporter):
- """
- Imports journal metadata ("containers") by ISSN, currently from a custom
- (data munged) .csv file format
-
- CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-
- ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
- """
-
- def __init__(self, api, **kwargs):
-
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra)
-
- def parse_issn_row(self, row):
- """
- row is a python dict (parsed from CSV).
- returns a ContainerEntity (or None if invalid or couldn't parse)
- """
- title = or_none(row['title'])
- issnl = or_none(row['ISSN-L'])
- if title is None or issnl is None:
- return None
- extra = dict(
- in_doaj=truthy(row['in_doaj']),
- in_road=truthy(row['in_road']),
- in_norwegian=truthy(row['in_norwegian']),
- language=or_none(row['lang']),
- url=or_none(row['url']),
- ISSNp=or_none(row['ISSN-print']),
- ISSNe=or_none(row['ISSN-electronic']),
- is_oa=truthy(row['is_oa']),
- is_kept=truthy(row['is_kept']),
- )
- ce = fatcat_client.ContainerEntity(
- issnl=issnl,
- name=title,
- publisher=or_none(row['publisher']),
- abbrev=None,
- coden=None,
- extra=extra)
- return ce
-
- def create_row(self, row, editgroup_id=None):
- ce = self.parse_issn_row(row)
- if ce is not None:
- self.api.create_container(ce, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- def create_batch(self, batch):
- """Reads and processes in batches (not API-call-per-line)"""
- objects = [self.parse_issn_row(l)
- for l in batch if (l is not None)]
- objects = [o for o in objects if (o is not None)]
- self.api.create_container_batch(objects, autoaccept="true")
- self.counts['insert'] += len(objects)
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
new file mode 100644
index 00000000..cf3971b5
--- /dev/null
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -0,0 +1,183 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from .common import EntityImporter, clean
+
+
+def or_none(s):
+ if s is None:
+ return None
+ if len(s) == 0:
+ return None
+ return s
+
+def truthy(s):
+ if s is None:
+ return None
+ s = s.lower()
+
+ if s in ('true', 't', 'yes', 'y', '1'):
+ return True
+ elif s in ('false', 'f', 'no', 'n', '0'):
+ return False
+ else:
+ return None
+
+class JournalMetadataImporter(EntityImporter):
+ """
+ Imports journal metadata ("containers") by ISSN, currently from a custom
+ (data munged) .csv file format
+
+ CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+
+ ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+
+
+ 'extra' fields:
+
+ doaj
+ as_of: datetime of most recent check; if not set, not actually in DOAJ
+ seal: bool
+ work_level: bool (are work-level publications deposited with DOAJ?)
+ archiving: array, can include 'library' or 'other'
+ road
+ as_of: datetime of most recent check; if not set, not actually in ROAD
+ pubmed (TODO: delete?)
+ as_of: datetime of most recent check; if not set, not actually indexed in pubmed
+ norwegian (TODO: drop this?)
+ as_of: datetime of most recent check; if not set, not actually indexed in the Norwegian registry
+ id (integer)
+ level (integer; 0-2)
+ kbart
+ lockss
+ year_rle
+ volume_rle
+ portico
+ ...
+ clockss
+ ...
+ sherpa_romeo
+ color
+ jstor
+ year_rle
+ volume_rle
+ scopus
+ id
+ TODO: print/electronic distinction?
+ wos
+ id
+ doi
+ crossref_doi: DOI of the title in crossref (if exists)
+ prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
+ ia
+ sim
+ nap_id
+ year_rle
+ volume_rle
+ longtail: boolean
+ homepage
+ as_of: datetime of last attempt
+ url
+ status: HTTP/heritrix status of homepage crawl
+
+ issnp: string
+ issne: string
+ coden: string
+ abbrev: string
+ oclc_id: string (TODO: lookup?)
+ lccn_id: string (TODO: lookup?)
+ dblp_id: string
+ default_license: slug
+ original_name: native name (if name is translated)
+ platform: hosting platform: OJS, wordpress, scielo, etc
+ mimetypes: array of strings (eg, 'application/pdf', 'text/html')
+ first_year: year (integer)
+ last_year: if publishing has stopped
+ primary_language: single ISO code, or 'mixed'
+ languages: array of ISO codes
+ region: TODO: continent/world-region
+ nation: shortcode of nation
+ discipline: TODO: highest-level subject; "life science", "humanities", etc
+ field: TODO: narrower description of field
+ subjects: TODO?
+ url: homepage
+ is_oa: boolean. If true, can assume all releases under this container are "Open Access"
+ TODO: domains, if exclusive?
+ TODO: fulltext_regex, if a known pattern?
+
+ For KBART, etc:
+ We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
+ year and volume spans are run-length-encoded arrays, using integers:
+ - if an integer, means that year is preserved
+ - if an array of length 2, means everything between the two numbers (inclusive) is preserved
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra)
+
+ def want(self, raw_record):
+ if raw_record.get('ISSN-L'):
+ return True
+ return False
+
+ def parse_record(self, row):
+ """
+ row is a python dict (parsed from CSV).
+ returns a ContainerEntity (or None if invalid or couldn't parse)
+ """
+ title = or_none(row['title'])
+ issnl = or_none(row['ISSN-L'])
+ if title is None or issnl is None:
+ return None
+ extra = dict(
+ in_doaj=truthy(row['in_doaj']),
+ in_road=truthy(row['in_road']),
+ in_norwegian=truthy(row['in_norwegian']),
+ language=or_none(row['lang']),
+ url=or_none(row['url']),
+ ISSNp=or_none(row['ISSN-print']),
+ ISSNe=or_none(row['ISSN-electronic']),
+ is_oa=truthy(row['is_oa']),
+ is_kept=truthy(row['is_kept']),
+ )
+ ce = fatcat_client.ContainerEntity(
+ issnl=issnl,
+ name=clean(title),
+ publisher=or_none(clean(row['publisher'])),
+ extra=extra)
+ return ce
+
+ def try_update(self, ce):
+
+ existing = None
+ try:
+ existing = self.api.lookup_container(issnl=ce.issnl)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ # doesn't exist, so proceed to insert
+ return True
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_container_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+
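
The run-length-encoded year/volume spans described in the docstring above can be expanded with a few lines of plain Python; this decoder is illustrative only (the helper name is made up) and not part of the importer:

    def expand_rle_span(spans):
        """Expand e.g. [1950, [1955, 1958]] into the set of preserved years."""
        years = set()
        for span in spans:
            if isinstance(span, int):
                years.add(span)
            elif isinstance(span, list) and len(span) == 2:
                # two-element array means an inclusive range
                years.update(range(span[0], span[1] + 1))
        return years

    assert expand_rle_span([1950, [1955, 1958]]) == {1950, 1955, 1956, 1957, 1958}
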
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 1e5c22f7..2ec6c95d 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,16 +4,10 @@ import json
import sqlite3
import itertools
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
-#row = row.split('\t')
-#assert len(row) == 2
-#sha1 = row[0].replace('sha1:')
-#sha1 = base64.b16encode(base64.b32decode(sha1)).lower()
-#print(sha1)
-#dois = [d.lower() for d in json.loads(row[1])]
-class MatchedImporter(FatcatImporter):
+class MatchedImporter(EntityImporter):
"""
Importer for "file to crossref DOI" matches.
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter):
editgroup_extra=eg_extra)
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.default_mime = kwargs.get("default_mime", None)
- self.skip_file_updates = kwargs.get("skip_file_updates", False)
def make_url(self, raw):
rel = self.default_link_rel
@@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter):
rel = "repository"
elif "//web.archive.org/" in raw or "//archive.is/" in raw:
rel = "webarchive"
- return fatcat_client.FileEntityUrls(url=raw, rel=rel)
+ return (rel, raw)
- def parse_matched_dict(self, obj):
- sha1 = obj['sha1']
- dois = [d.lower() for d in obj.get('dois', [])]
+ def want(self, raw_record):
+ return True
- # lookup sha1, or create new entity
- fe = None
- if not self.skip_file_updates:
- try:
- fe = self.api.lookup_file(sha1=sha1)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- if fe is None:
- fe = fatcat_client.FileEntity(
- sha1=sha1,
- release_ids=[],
- urls=[],
- )
+ def parse_record(self, obj):
+ dois = [d.lower() for d in obj.get('dois', [])]
# lookup dois
re_list = set()
@@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter):
print("DOI not found: {}".format(doi))
else:
re_list.add(re.ident)
- if len(re_list) == 0:
+ release_ids = list(re_list)
+ if len(release_ids) == 0:
return None
- if fe.release_ids == set(re_list):
- return None
- re_list.update(fe.release_ids)
- fe.release_ids = list(re_list)
# parse URLs and CDX
- existing_urls = [feu.url for feu in fe.urls]
+ urls = set()
for url in obj.get('url', []):
- if url not in existing_urls:
- url = self.make_url(url)
- if url != None:
- fe.urls.append(url)
+ url = self.make_url(url)
+ if url != None:
+ urls.add(url)
for cdx in obj.get('cdx', []):
original = cdx['url']
wayback = "https://web.archive.org/web/{}/{}".format(
cdx['dt'],
original)
- if wayback not in existing_urls:
- fe.urls.append(
- fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
- if original not in existing_urls:
- url = self.make_url(original)
- if url != None:
- fe.urls.append(url)
-
- if obj.get('size') != None:
- fe.size = int(obj['size'])
- fe.sha256 = obj.get('sha256', fe.sha256)
- fe.md5 = obj.get('md5', fe.sha256)
- if obj.get('mimetype') is None:
- if fe.mimetype is None:
- fe.mimetype = self.default_mime
- else:
- fe.mimetype = obj.get('mimetype')
+ urls.add(("webarchive", wayback))
+ url = self.make_url(original)
+ if url != None:
+ urls.add(url)
+ urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls]
+ if len(urls) == 0:
+ return None
+
+ size = obj.get('size')
+ if size:
+ size = int(size)
+
+ fe = fatcat_client.FileEntity(
+ md5=obj.get('md5'),
+ sha1=obj['sha1'],
+ sha256=obj.get('sha256'),
+ size=size,
+ mimetype=obj.get('mimetype'),
+ release_ids=release_ids,
+ urls=urls,
+ )
return fe
- def create_row(self, row, editgroup_id=None):
- obj = json.loads(row)
- fe = self.parse_matched_dict(obj)
- if fe is not None:
- if fe.ident is None:
- self.api.create_file(fe, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
- else:
- self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id)
- self.counts['update'] += 1
-
- def create_batch(self, batch):
- """Reads and processes in batches (not API-call-per-line)"""
- objects = [self.parse_matched_dict(json.loads(l))
- for l in batch if l != None]
- new_objects = [o for o in objects if o != None and o.ident == None]
- update_objects = [o for o in objects if o != None and o.ident != None]
- if len(update_objects):
- update_eg = self._editgroup().editgroup_id
- for obj in update_objects:
- self.api.update_file(obj.ident, obj, editgroup_id=update_eg)
- self.api.accept_editgroup(update_eg)
- if len(new_objects) > 0:
- self.api.create_file_batch(new_objects, autoaccept="true")
- self.counts['update'] += len(update_objects)
- self.counts['insert'] += len(new_objects)
+ def try_update(self, fe):
+ # lookup sha1, or create new entity
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ return True
+
+ fe.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
+ # no new release matches *and* there are already existing URLs
+ self.counts['exists'] += 1
+ return False
+
+ # merge the existing into this one and update
+ existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
+ existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
+ existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ existing.mimetype = existing.mimetype or fe.mimetype
+ existing.size = existing.size or fe.size
+ existing.md5 = existing.md5 or fe.md5
+ existing.sha256 = existing.sha256 or fe.sha256
+ self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup())
+ self.counts['update'] += 1
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_file_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+
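
Because MatchedImporter now carries URLs as (rel, url) tuples until the FileEntity is built, merging against an existing file in try_update() reduces to set arithmetic on tuples. A tiny standalone illustration with made-up URLs:

    # hypothetical URL lists; exact (rel, url) duplicates collapse in the set
    new_urls = [("web", "http://example.com/paper.pdf")]
    old_urls = [
        ("webarchive", "https://web.archive.org/web/2018/http://example.com/paper.pdf"),
        ("web", "http://example.com/paper.pdf"),
    ]
    merged = sorted(set(new_urls + old_urls))
    assert len(merged) == 2  # the duplicate "web" URL appears only once
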
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 0c8b1d62..02c9bf00 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
def value_or_none(e):
if type(e) == dict:
@@ -20,7 +20,7 @@ def value_or_none(e):
return None
return e
-class OrcidImporter(FatcatImporter):
+class OrcidImporter(EntityImporter):
def __init__(self, api, **kwargs):
@@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter):
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
- def parse_orcid_dict(self, obj):
+ def want(self, raw_record):
+ return True
+
+ def parse_record(self, obj):
"""
obj is a python dict (parsed from json).
returns a CreatorEntity
"""
name = obj['person']['name']
- if name is None:
- return None
+ assert name
extra = None
given = value_or_none(name.get('given-names'))
sur = value_or_none(name.get('family-name'))
@@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter):
return None
ce = fatcat_client.CreatorEntity(
orcid=orcid,
- given_name=given,
- surname=sur,
- display_name=display,
+ given_name=clean(given),
+ surname=clean(sur),
+ display_name=clean(display),
extra=extra)
return ce
- def create_row(self, row, editgroup_id=None):
- obj = json.loads(row)
- ce = self.parse_orcid_dict(obj)
- if ce is not None:
- self.api.create_creator(ce, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
+ def try_update(self, raw_record):
+ existing = None
+ try:
+ existing = self.api.lookup_creator(orcid=raw_record.orcid)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
- def create_batch(self, batch):
- """Reads and processes in batches (not API-call-per-line)"""
- objects = [self.parse_orcid_dict(json.loads(l))
- for l in batch if l != None]
- objects = [o for o in objects if o != None]
- self.api.create_creator_batch(objects, autoaccept="true")
- self.counts['insert'] += len(objects)
+ def insert_batch(self, batch):
+ self.api.create_creator_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
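
All of the converted importers above share the same four-hook contract: want() filters cheaply, parse_record() builds an entity or returns None, try_update() decides whether the entity should be queued, and insert_batch() pushes accepted batches through the autoaccept API. A minimal sketch of a hypothetical subclass, assuming EntityImporter is exported from fatcat_tools.importers alongside the concrete importers:

    import fatcat_client
    from fatcat_tools.importers import EntityImporter

    class MinimalCreatorImporter(EntityImporter):
        """Illustrative only; not part of this commit."""

        def want(self, raw_record):
            # cheap filter; returning False counts the record as a skip
            return bool(raw_record.get('orcid'))

        def parse_record(self, raw_record):
            # return an entity, or None to skip
            return fatcat_client.CreatorEntity(
                orcid=raw_record['orcid'],
                display_name=raw_record.get('name'))

        def try_update(self, entity):
            # True means "queue for insert_batch()"; False means handled or skipped here
            return True

        def insert_batch(self, batch):
            self.api.create_creator_batch(batch, autoaccept=True)
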
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 0f957f9a..2493b1ab 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -1,4 +1,5 @@
+
import collections
from fatcat_client import ReleaseEntity, ApiClient
@@ -26,25 +27,43 @@ def release_to_elasticsearch(release):
Raises exception on error (never returns None)
"""
- if release.state != 'active':
- raise ValueError("Entity is not 'active'")
+ if release.state in ('redirect', 'deleted'):
+ return dict(
+ ident = release.ident,
+ state = release.state,
+ )
+ elif release.state != 'active':
+ raise ValueError("Unhandled release state: {}".format(release.state))
# First, the easy ones (direct copy)
t = dict(
ident = release.ident,
+ state = release.state,
revision = release.revision,
title = release.title,
+ original_title = release.original_title,
release_type = release.release_type,
release_status = release.release_status,
language = release.language,
+ license = release.license_slug,
doi = release.doi,
pmid = release.pmid,
pmcid = release.pmcid,
isbn13 = release.isbn13,
+ wikidata_qid = release.wikidata_qid,
core_id = release.core_id,
- wikidata_qid = release.wikidata_qid
+ arxiv_id = release.arxiv_id,
+ jstor_id = release.jstor_id,
)
+ is_oa = None
+ is_longtail_oa = None
+ in_kbart = None
+ in_web = False
+ in_dweb = False
+ in_ia = False
+ in_shadow = False
+
if release.release_date:
# .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
t['release_date'] = release.release_date.isoformat()
@@ -53,52 +72,99 @@ def release_to_elasticsearch(release):
if release.release_year is not None:
t['release_year'] = release.release_year
+ t['any_abstract'] = len(release.abstracts) > 0
+ t['ref_count'] = len(release.refs or [])
+ t['contrib_count'] = len(release.contribs or [])
+ contrib_names = []
+ for c in (release.contribs or []):
+ if c.raw_name:
+ contrib_names.append(c.raw_name)
+ t['contrib_names'] = contrib_names
+
container = release.container
- container_is_kept = False
if container:
t['publisher'] = container.publisher
t['container_name'] = container.name
t['container_issnl'] = container.issnl
- container_extra = container.extra
- if container_extra:
- t['container_is_oa'] = container_extra.get('is_oa')
- container_is_kept = container_extra.get('is_kept', False)
- t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+ t['container_type'] = container.container_type
+ if container.extra:
+ if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+ is_oa = True
+ if container.extra.get('in_kbart'):
+ # TODO: better KBART check goes here
+ in_kbart = True
+ if container.extra.get('ia'):
+ # TODO: container longtail check goes here
+ # TODO: sim/microfilm check goes here
+ pass
+ # TODO: SHERPA/Romeo goes here
else:
t['publisher'] = release.publisher
files = release.files or []
t['file_count'] = len(files)
- in_wa = False
- in_ia = False
- t['file_pdf_url'] = None
+ t['fileset_count'] = len(release.filesets or [])
+ t['webcapture_count'] = len(release.webcaptures or [])
+ any_pdf_url = None
+ good_pdf_url = None
+ best_pdf_url = None
+ ia_pdf_url = None
for f in files:
+ if f.extra and f.extra.get('shadows'):
+ # TODO: shadow check goes here
+ in_shadow = True
is_pdf = 'pdf' in (f.mimetype or '')
for url in (f.urls or []):
- if url.rel == 'webarchive':
- in_wa = True
- if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''):
+ if url.url.lower().startswith('http'):
+ in_web = True
+ if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+ # TODO: not sure what rel will be
+ in_dweb = True
+ if is_pdf:
+ any_pdf_url = url.url
+ if is_pdf and url.rel in ('webarchive', 'repository'):
+ is_preserved = True
+ good_pdf_url = url.url
+ if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
in_ia = True
if is_pdf:
- t['file_pdf_url'] = url.url
- if not t['file_pdf_url'] and is_pdf:
- t['file_pdf_url'] = url.url
- t['file_in_webarchive'] = in_wa
- t['file_in_ia'] = in_ia
+ best_pdf_url = url.url
+ ia_pdf_url = url.url
+ # here is where we bake-in priority; IA-specific
+ t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+ t['ia_pdf_url'] = ia_pdf_url
+
+ if release.license_slug:
+ # TODO: more/better checks here, particularly strict *not* OA licenses
+ if release.license_slug.startswith("CC-"):
+ is_oa = True
extra = release.extra or dict()
if extra:
- t['in_shadow'] = extra.get('in_shadow')
- if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
- t['container_is_longtail_oa'] = True
- t['any_abstract'] = bool(release.abstracts)
- t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+ # TODO: longtail OA check from GROBID here
+ if extra.get('in_kbart'):
+ # NOTE: not actually setting this anywhere
+ in_kbart = True
+ if extra.get('is_oa'):
+ # NOTE: not actually setting this anywhere
+ is_oa = True
+ if extra.get('grobid'):
+ if not t.get('container_name'):
+ t['container_name'] = extra['grobid'].get('container_name')
+ if extra['grobid'].get('longtail_oa'):
+ is_longtail_oa = True
+ if extra.get('crossref'):
+ if extra['crossref'].get('archive'):
+ # all crossref archives are KBART, I believe
+ in_kbart = True
- t['ref_count'] = len(release.refs or [])
- t['contrib_count'] = len(release.contribs or [])
- contrib_names = []
- for c in (release.contribs or []):
- if c.raw_name:
- contrib_names.append(c.raw_name)
- t['contrib_names'] = contrib_names
+ if is_longtail_oa:
+ is_oa = True
+ t['is_oa'] = is_oa
+ t['is_longtail_oa'] = is_longtail_oa
+ t['in_kbart'] = in_kbart
+ t['in_web'] = in_web
+ t['in_dweb'] = in_dweb
+ t['in_ia'] = in_ia
+ t['is_preserved'] = in_ia or in_kbart
return t
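
Downstream consumers see the result of this transform as a flat dict: the preservation and OA summaries are plain booleans, and best_pdf_url is a priority fallback (archive.org, then webarchive/repository, then any PDF). A hedged usage sketch, assuming an `api` client and a release `ident`, and matching the expanded fetch the changelog worker performs in the next hunk:

    from fatcat_tools.transforms import release_to_elasticsearch

    release = api.get_release(ident, expand="files,filesets,webcaptures,container")
    doc = release_to_elasticsearch(release)
    print(doc['is_oa'], doc['in_kbart'], doc['in_ia'], doc['is_preserved'])
    print(doc['best_pdf_url'])  # archive.org > webarchive/repository > any PDF
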
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 8690a791..636ed304 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -93,7 +93,7 @@ class EntityUpdatesWorker(FatcatWorker):
release_edits = cle['editgroup']['edits']['releases']
for re in release_edits:
ident = re['ident']
- release = self.api.get_release(ident, expand="files,container")
+ release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
release_dict = self.api.api_client.sanitize_for_serialization(release)
producer.produce(
message=json.dumps(release_dict).encode('utf-8'),
diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py
index 8035cbe5..03964c92 100644
--- a/python/fatcat_web/auth.py
+++ b/python/fatcat_web/auth.py
@@ -90,7 +90,10 @@ def handle_ia_xauth(email, password):
'secret': Config.IA_XAUTH_CLIENT_SECRET,
})
if resp.status_code == 401 or (not resp.json().get('success')):
- flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason']))
+ try:
+ flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason']))
+ except Exception:
+ print("IA XAuth fail: {}".format(resp.content))
return render_template('auth_ia_login.html', email=email), resp.status_code
elif resp.status_code != 200:
flash("Internet Archive login failed (internal error?)")
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index a5927d9b..926d5340 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -4,7 +4,7 @@ import json
from flask import Flask, render_template, send_from_directory, request, \
url_for, abort, g, redirect, jsonify, session, flash
from flask_login import login_required
-from fatcat_web import app, api, auth_api
+from fatcat_web import app, api, auth_api, priv_api
from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth
from fatcat_client.rest import ApiException
from fatcat_web.search import do_search
@@ -368,6 +368,8 @@ def search():
@app.route('/auth/login')
def login():
# show the user a list of login options
+ if not priv_api:
+ flash("This web interface not configured with credentials to actually allow login (other than via token)")
return render_template('auth_login.html')
@app.route('/auth/ia/login', methods=['GET', 'POST'])
diff --git a/python/fatcat_web/templates/container_view.html b/python/fatcat_web/templates/container_view.html
index 29f0b9d9..4a175a5d 100644
--- a/python/fatcat_web/templates/container_view.html
+++ b/python/fatcat_web/templates/container_view.html
@@ -15,12 +15,6 @@
<p><b>Publisher:</b>
{% if container.publisher != None %}{{ container.publisher }}{% else %}<i>Unknown</i>{% endif %}
-{% if container.coden != None %}
-<br><b>CODEN<sup><a href="https://en.wikipedia.org/wiki/CODEN">?</a></sup>:</b> &nbsp;<code>{{ container.coden }}</code>
-{% endif %}
-{% if container.abbrev != None %}
-<br><b>Abbrev.:</b> &nbsp;<code>{{ container.abbrev }}</code>
-{% endif %}
{% if (container.extra != None) and (container.extra['url'] != None) and (container.extra['url']|length > 0) %}
<br><b>Homepage:</b> <a href="{{ container.extra['url'] }}">&nbsp;<code>{{ container.extra['url'] }}</code></a>
{% endif %}
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index fd86b7c9..4e24b281 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -143,7 +143,7 @@ Raw Object:
{% endif %}
<br>
-{% if release.refs.size != 0 %}
+{% if release.refs != None and release.refs.size != 0 %}
<h3>References</h3>
This release citing other releases.
<ol>
diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py
index cbe519b0..9ce32ed7 100644
--- a/python/fatcat_web/web_config.py
+++ b/python/fatcat_web/web_config.py
@@ -19,7 +19,7 @@ class Config(object):
GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip().decode('utf-8')
# This is, effectively, the QA/PROD flag
- FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="qa.fatcat.wiki")
+ FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="dev.fatcat.wiki")
FATCAT_API_AUTH_TOKEN = os.environ.get("FATCAT_API_AUTH_TOKEN", default=None)
FATCAT_API_HOST = os.environ.get("FATCAT_API_HOST", default="https://{}/v0".format(FATCAT_DOMAIN))
@@ -39,10 +39,11 @@ class Config(object):
IA_XAUTH_CLIENT_SECRET = os.environ.get("IA_XAUTH_CLIENT_SECRET", default=None)
# protect cookies (which include API tokens)
- SESSION_COOKIE_HTTPONLY = True
- SESSION_COOKIE_SECURE = True
- SESSION_COOKIE_SAMESITE = 'Lax'
- PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds
+ if FATCAT_DOMAIN != "dev.fatcat.wiki":
+ SESSION_COOKIE_HTTPONLY = True
+ SESSION_COOKIE_SECURE = True
+ SESSION_COOKIE_SAMESITE = 'Lax'
+ PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds
try:
GIT_RELEASE = raven.fetch_git_sha('..')
diff --git a/python/tests/api_annotations.py b/python/tests/api_annotations.py
new file mode 100644
index 00000000..0d3c5046
--- /dev/null
+++ b/python/tests/api_annotations.py
@@ -0,0 +1,39 @@
+
+import json
+import pytest
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_annotations(api):
+
+ eg = quick_eg(api)
+
+ # ensure no annotations on this object
+ a = api.get_editgroup_annotations(eg.editgroup_id)
+ assert a == []
+
+ # create an annotation!
+ api.create_editgroup_annotation(
+ eg.editgroup_id,
+ EditgroupAnnotation(
+ comment_markdown="some *annotation*",
+ extra=dict(thing="thang")))
+
+ # check that we can fetch it all sorts of ways
+ a = api.get_editgroup_annotations(eg.editgroup_id)
+ assert len(a) == 1
+ assert a[0].extra['thing'] == "thang"
+
+ # the editor persists, so this is a hack to find a "recent" one
+ a2 = api.get_editor_annotations(eg.editor_id, limit=100)
+ found = None
+ for thing in a2:
+ if thing.annotation_id == a[0].annotation_id:
+ found = thing
+ break
+ assert found
+ assert found.extra['thing'] == "thang"
diff --git a/python/tests/api_containers.py b/python/tests/api_containers.py
new file mode 100644
index 00000000..674ae3b8
--- /dev/null
+++ b/python/tests/api_containers.py
@@ -0,0 +1,48 @@
+
+import json
+import pytest
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_container(api):
+ eg = quick_eg(api)
+
+ # all the fields!
+ c1 = ContainerEntity(
+ name="some container name",
+ container_type="journal",
+ publisher="some container publisher",
+ issnl="1234-567X",
+ wikidata_qid="Q954248",
+ extra=dict(a=1, b=2),
+ )
+
+ c1edit = api.create_container(c1, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ c2 = api.get_container(c1edit.ident)
+
+ # check that fields match
+ assert c1.name == c2.name
+ assert c1.container_type == c2.container_type
+ assert c1.publisher == c2.publisher
+ assert c1.issnl == c2.issnl
+ assert c1.wikidata_qid == c2.wikidata_qid
+ assert c1.extra == c2.extra
+
+ # expansion
+ # TODO: via release
+ # lookup
+ # TODO: via issnl; but need to generate random identifiers
+
+def test_container_examples(api):
+
+ api.lookup_container(issnl='1549-1277')
+
+ c1 = api.get_container('aaaaaaaaaaaaaeiraaaaaaaaam')
+ assert c1.name == "PLOS Medicine"
+ assert c1.issnl == "1549-1277"
+
diff --git a/python/tests/api_creators.py b/python/tests/api_creators.py
new file mode 100644
index 00000000..7443675b
--- /dev/null
+++ b/python/tests/api_creators.py
@@ -0,0 +1,44 @@
+
+import json
+import pytest
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_creators(api):
+ eg = quick_eg(api)
+
+ # all the fields!
+ c1 = CreatorEntity(
+ display_name="Emma Smith",
+ given_name="emma",
+ surname="smith",
+ orcid="0000-0002-1825-0097",
+ wikidata_qid="Q9542248",
+ extra=dict(a=1, b=5),
+ )
+
+ c1edit = api.create_creator(c1, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ c2 = api.get_creator(c1edit.ident)
+
+ # check that fields match
+ assert c1.display_name == c2.display_name
+ assert c1.given_name == c2.given_name
+ assert c1.surname == c2.surname
+ assert c1.orcid == c2.orcid
+ assert c1.wikidata_qid == c2.wikidata_qid
+ assert c1.extra == c2.extra
+
+ # expansion
+ # TODO: via release
+ # lookup
+ # TODO: via issnl; but need to generate random identifiers
+
+def test_creators_examples(api):
+ # TODO: aaaaaaaaaaaaaircaaaaaaaaam
+
+ api.lookup_creator(orcid='0000-0003-3118-6859')
diff --git a/python/tests/api_editgroups.py b/python/tests/api_editgroups.py
new file mode 100644
index 00000000..722d8686
--- /dev/null
+++ b/python/tests/api_editgroups.py
@@ -0,0 +1,140 @@
+
+import json
+import pytest
+import datetime
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_editgroup_submit(api):
+ # 1. check that edit group can be submitted/unsubmitted, and shows up in reviewable appropriately
+ # 2. accepted edits don't show up as reviewable and can't be submitted
+
+ c1 = CreatorEntity(display_name="test updates")
+ eg = quick_eg(api)
+ c1 = api.get_creator(api.create_creator(c1, editgroup_id=eg.editgroup_id).ident)
+
+ eg2 = api.get_editgroup(eg.editgroup_id)
+ assert not eg2.submitted
+ assert not eg2.changelog_index
+
+ reviewable = api.get_editgroups_reviewable(limit=100)
+ assert eg.editgroup_id not in [v.editgroup_id for v in reviewable]
+ wip = api.get_editor_editgroups(eg.editor_id, limit=100)
+ assert eg.editgroup_id in [v.editgroup_id for v in wip]
+
+ api.update_editgroup(eg.editgroup_id, eg2, submit=True)
+ eg3 = api.get_editgroup(eg.editgroup_id)
+ assert eg3.submitted
+ reviewable = api.get_editgroups_reviewable(limit=100)
+ assert eg.editgroup_id in [v.editgroup_id for v in reviewable]
+
+ api.update_editgroup(eg.editgroup_id, eg2, submit=False)
+ eg3 = api.get_editgroup(eg.editgroup_id)
+ assert not eg3.submitted
+ reviewable = api.get_editgroups_reviewable(limit=100)
+ assert eg.editgroup_id not in [v.editgroup_id for v in reviewable]
+
+ # put back in reviewable
+ api.update_editgroup(eg.editgroup_id, eg2, submit=True)
+ reviewable = api.get_editgroups_reviewable(limit=100)
+ assert eg.editgroup_id in [v.editgroup_id for v in reviewable]
+
+ # shouldn't be reviewable if accepted
+ api.accept_editgroup(eg.editgroup_id)
+ reviewable = api.get_editgroups_reviewable(limit=100)
+ assert eg.editgroup_id not in [v.editgroup_id for v in reviewable]
+ eg3 = api.get_editgroup(eg.editgroup_id)
+ #print(eg3)
+ assert eg3.submitted
+ assert eg3.changelog_index
+
+ with pytest.raises(fatcat_client.rest.ApiException):
+ api.update_editgroup(eg.editgroup_id, eg3, submit=True)
+ with pytest.raises(fatcat_client.rest.ApiException):
+ eg3.description = "something"
+ api.update_editgroup(eg.editgroup_id, eg3)
+
+
+def test_editgroup_ordering(api):
+
+ eg1 = quick_eg(api)
+ eg2 = quick_eg(api)
+ api.update_editgroup(
+ eg1.editgroup_id,
+ Editgroup(editgroup_id=eg1.editgroup_id, description="FAIL"),
+ submit=True)
+ api.update_editgroup(
+ eg2.editgroup_id,
+ Editgroup(editgroup_id=eg2.editgroup_id, description="FAIL"),
+ submit=True)
+
+ r1 = api.get_editgroups_reviewable()
+ #print(r1)
+ assert not r1[0].description
+ assert not r1[1].description
+ assert r1[0].submitted >= r1[1].submitted
+
+ # should be no editgroups "in the future" (since now + 1sec)
+ r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() + datetime.timedelta(seconds=1)).isoformat()+"Z")
+ assert not r1
+
+ r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() - datetime.timedelta(seconds=5)).isoformat()+"Z")
+ assert r1[0].submitted <= r1[1].submitted
+
+
+def test_editgroup_autoaccept(api):
+ # autoaccept changes: editgroups required when, in what combination
+
+ eg = quick_eg(api)
+ c1 = CreatorEntity(display_name="test autoaccept")
+ c2 = CreatorEntity(display_name="test another autoaccept")
+
+ with pytest.raises(fatcat_client.rest.ApiException):
+ edits = api.create_creator_batch([c1, c2])
+
+ with pytest.raises(fatcat_client.rest.ApiException):
+ edits = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id, autoaccept=True)
+
+ edits1 = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id)
+ edits2 = api.create_creator_batch([c1, c2], autoaccept=True)
+
+ assert edits1[0].editgroup_id == eg.editgroup_id
+ assert edits1[0].editgroup_id != edits2[1].editgroup_id
+ eg1 = api.get_editgroup(edits1[0].editgroup_id)
+ eg2 = api.get_editgroup(edits2[0].editgroup_id)
+
+ assert not eg1.changelog_index
+ assert eg2.changelog_index
+ #print(edits1)
+ #print(eg1.edits.creators)
+ assert eg1.edits.creators[0].ident in [t.ident for t in edits1]
+ assert eg2.edits.creators[0].ident in [t.ident for t in edits2]
+
+
+def test_batch_params(api):
+
+ eg = quick_eg(api)
+ c1 = CreatorEntity(display_name="test autoaccept")
+ c2 = CreatorEntity(display_name="test another autoaccept")
+
+ with pytest.raises(fatcat_client.rest.ApiException):
+ edits = api.create_creator_batch([c1, c2])
+
+ desc = "test description"
+ extra = dict(a=75, q="thing")
+ edits = api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=json.dumps(extra))
+ eg = api.get_editgroup(edits[0].editgroup_id)
+
+ assert eg.description == desc
+ assert eg.extra == extra
+
+ # currently must manually json dumps() extra field
+ with pytest.raises(fatcat_client.rest.ApiException):
+ api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=extra)
+
+ with pytest.raises(fatcat_client.rest.ApiException):
+ api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra="{")
diff --git a/python/tests/api_files.py b/python/tests/api_files.py
new file mode 100644
index 00000000..033538ef
--- /dev/null
+++ b/python/tests/api_files.py
@@ -0,0 +1,52 @@
+
+import json
+import pytest
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_file(api):
+
+ eg = quick_eg(api)
+
+ # all the fields!
+ f1 = FileEntity(
+ size=89238,
+ md5="7ce6615b2a5904939576d9567bd5f68e",
+ sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
+ sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3",
+ mimetype="application/pdf",
+ extra=dict(a=2, b=5),
+ urls=[
+ FileEntityUrls(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"),
+ ],
+ release_ids=[],
+ )
+
+ f1edit = api.create_file(f1, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ f2 = api.get_file(f1edit.ident)
+
+ # check that fields match
+ assert f1.size == f2.size
+ assert f1.md5 == f2.md5
+ assert f1.sha1 == f2.sha1
+ assert f1.sha256 == f2.sha256
+ assert f1.mimetype == f2.mimetype
+ assert f1.extra == f2.extra
+ assert f1.urls == f2.urls
+ assert f1.release_ids == f2.release_ids
+
+ # expansion
+ # TODO: via release
+ # lookup
+ # TODO: via hashes; but need to generate random?
+
+def test_file_examples(api):
+
+ api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362')
+
+ f1 = api.get_file('aaaaaaaaaaaaamztaaaaaaaaam')
diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py
new file mode 100644
index 00000000..966b85ca
--- /dev/null
+++ b/python/tests/api_filesets.py
@@ -0,0 +1,79 @@
+
+import json
+import pytest
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_fileset(api):
+
+ eg = quick_eg(api)
+ r1 = ReleaseEntity(title="test fileset release")
+ r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id)
+
+ fs1 = FilesetEntity(
+ manifest = [
+ FilesetEntityManifest(
+ path="data/thing.tar.gz",
+ size=54321,
+ md5="540da3ea6e448d8dfb057c05225f853a",
+ sha1="1dab6a0e110f9b5d70b18db0abf051f7f93faf06",
+ sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a",
+ extra={"a": 1, "b": 3},
+ ),
+ FilesetEntityManifest(
+ path="README.md",
+ size=54210,
+ md5="5f83592b5249671719bbed6ce91ecfa8",
+ sha1="455face3598611458efe1f072e58624790a67266",
+ sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905",
+ extra={"x": 1, "y": "q"},
+ ),
+ ],
+ urls = [
+ FileEntityUrls(url="https://archive.org/download/fileset-123/", rel="repository"),
+ FileEntityUrls(url="https://humble-host.com/~user123/dataset/", rel="web"),
+ ],
+ release_ids = [r1edit.ident],
+ )
+
+ fs1edit = api.create_fileset(fs1, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ fs2 = api.get_fileset(fs1edit.ident)
+
+ # check that fields match
+ assert fs1.urls == fs2.urls
+ assert fs1.manifest == fs2.manifest
+ assert fs1.release_ids == fs2.release_ids
+
+ # expansion
+ r1 = api.get_release(r1edit.ident, expand="filesets")
+ assert r1.filesets[0].manifest == fs1.manifest
+
+def test_fileset_examples(api):
+ fs3 = api.get_fileset('aaaaaaaaaaaaaztgaaaaaaaaam')
+
+ assert fs3.urls[0].url == 'http://other-personal-blog.name/dataset/'
+ assert fs3.urls[1].rel == 'archive'
+ assert fs3.manifest[1].md5 == 'f4de91152c7ab9fdc2a128f962faebff'
+ assert fs3.manifest[1].extra['mimetype'] == 'application/gzip'
+
+def test_bad_fileset(api):
+
+ eg = quick_eg(api)
+
+ bad_list = [
+ # good (for testing test itself)
+ #FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size=1234)]),
+ #FilesetEntity(urls=[FileEntityUrls(url="thing", rel="blah")]),
+ FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size="big")]),
+ FilesetEntity(release_ids=["asdf"]),
+ ]
+
+ for b in bad_list:
+ with pytest.raises(fatcat_client.rest.ApiException):
+ api.create_fileset(b, editgroup_id=eg.editgroup_id)
+
diff --git a/python/tests/api_misc.py b/python/tests/api_misc.py
index 3510ea82..0a0f16da 100644
--- a/python/tests/api_misc.py
+++ b/python/tests/api_misc.py
@@ -8,14 +8,6 @@ from fatcat_client.rest import ApiException
from fixtures import *
-def test_lookups(api):
-
- api.lookup_creator(orcid='0000-0003-3118-6859')
- api.lookup_container(issnl='1549-1277')
- api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362')
- api.lookup_release(pmid='54321')
- api.lookup_release(isbn13='978-3-16-148410-0')
-
def test_lookup_hide_extend(api):
r = api.lookup_release(doi='10.1371/journal.pmed.0020124')
diff --git a/python/tests/api_releases.py b/python/tests/api_releases.py
new file mode 100644
index 00000000..ed6f24a4
--- /dev/null
+++ b/python/tests/api_releases.py
@@ -0,0 +1,103 @@
+
+import json
+import pytest
+import datetime
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_release(api):
+
+ eg = quick_eg(api)
+
+ # all the fields!
+ r1 = ReleaseEntity(
+ title="some title",
+ original_title="оригинальное название",
+ release_type="post-weblog",
+ release_status="pre-print",
+ release_date=datetime.datetime.utcnow().date(),
+ release_year=2015,
+ doi="10.5555/12345678",
+ pmid="12345",
+ pmcid="PMC4321",
+ wikidata_qid="Q1234",
+ isbn13="978-3-16-148410-0",
+ core_id="187348",
+ arxiv_id="aslkdjfh",
+ jstor_id="8328424",
+ volume="84",
+ issue="XII",
+ pages="4-99",
+ publisher="some publisher",
+ language="en",
+ license_slug="CC-0",
+ extra=dict(a=1, b=2),
+ contribs=[],
+ refs=[],
+ abstracts=[
+ ReleaseEntityAbstracts(
+ content="this is some abstract",
+ mimetype="text/plain",
+ lang="en"),
+ ReleaseEntityAbstracts(
+ content="this is some other abstract",
+ mimetype="text/plain",
+ lang="de"),
+ ],
+ )
+
+ r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ r2 = api.get_release(r1edit.ident)
+
+ # check that fields match
+ assert r1.title == r2.title
+ assert r1.original_title == r2.original_title
+ assert r1.release_type == r2.release_type
+ assert r1.release_date == r2.release_date
+ assert r1.release_year == r2.release_year
+ assert r1.doi == r2.doi
+ assert r1.pmid == r2.pmid
+ assert r1.pmcid == r2.pmcid
+ assert r1.wikidata_qid == r2.wikidata_qid
+ assert r1.isbn13 == r2.isbn13
+ assert r1.core_id == r2.core_id
+ assert r1.arxiv_id == r2.arxiv_id
+ assert r1.jstor_id == r2.jstor_id
+ assert r1.volume == r2.volume
+ assert r1.issue == r2.issue
+ assert r1.pages == r2.pages
+ assert r1.publisher == r2.publisher
+ assert r1.language == r2.language
+ assert r1.license_slug == r2.license_slug
+ assert r1.extra == r2.extra
+
+ for i in range(len(r1.abstracts)):
+ assert r1.abstracts[i].content == r2.abstracts[i].content
+ assert r1.abstracts[i].mimetype == r2.abstracts[i].mimetype
+ assert r1.abstracts[i].lang == r2.abstracts[i].lang
+ for i in range(len(r1.contribs)):
+ assert r1.contribs[i] == r2.contribs[i]
+ for i in range(len(r1.refs)):
+ assert r1.refs[i] == r2.refs[i]
+
+ # expansion
+ # TODO: via work
+ # lookup
+ # TODO: via all; but need to generate random identifiers
+
+def test_release_examples(api):
+
+ api.lookup_release(pmid='54321')
+ api.lookup_release(isbn13='978-3-16-148410-0')
+
+ r1 = api.get_release('aaaaaaaaaaaaarceaaaaaaaaai')
+ assert r1.title == "bigger example"
+ assert len(r1.refs) == 5
+ assert r1.contribs[0].role == "editor"
+ assert r1.abstracts[0].mimetype == "application/xml+jats"
+
diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py
new file mode 100644
index 00000000..dc1754b3
--- /dev/null
+++ b/python/tests/api_webcaptures.py
@@ -0,0 +1,96 @@
+
+import json
+import pytest
+import datetime
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_webcapture(api):
+
+ eg = quick_eg(api)
+ r1 = ReleaseEntity(title="test webcapture release")
+ r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id)
+
+ wc1 = WebcaptureEntity(
+ original_url = "http://example.site",
+ #timestamp = "2012-01-02T03:04:05Z",
+ timestamp = datetime.datetime.now(datetime.timezone.utc),
+ cdx = [
+ WebcaptureEntityCdx(
+ surt="site,example,)/data/thing.tar.gz",
+ #timestamp="2012-01-02T03:04:05Z",
+ timestamp=datetime.datetime.now(datetime.timezone.utc),
+ url="http://example.site/data/thing.tar.gz",
+ mimetype="application/gzip",
+ status_code=200,
+ sha1="455face3598611458efe1f072e58624790a67266",
+ sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a",
+ ),
+ WebcaptureEntityCdx(
+ surt="site,example,)/README.md",
+ #timestamp="2012-01-02T03:04:05Z",
+ timestamp=datetime.datetime.now(datetime.timezone.utc),
+ url="http://example.site/README.md",
+ mimetype="text/markdown",
+ status_code=200,
+ sha1="455face3598611458efe1f072e58624790a67266",
+ sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905",
+ ),
+ ],
+ archive_urls = [
+ FileEntityUrls(rel="wayback", url="https://web.archive.org/web/"),
+ ],
+ release_ids = [r1edit.ident],
+ )
+
+ wc1edit = api.create_webcapture(wc1, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ wc2 = api.get_webcapture(wc1edit.ident)
+
+ # check that fields match
+ # I don't know why these aren't equal...
+ #print(wc1.archive_urls)
+ #print(wc2.archive_urls)
+ #assert wc1.archive_urls == wc2.archive_urls
+ assert wc1.archive_urls[0].rel == wc2.archive_urls[0].rel
+ assert wc1.archive_urls[0].url == wc2.archive_urls[0].url
+ assert wc1.cdx == wc2.cdx
+ assert wc1.release_ids == wc2.release_ids
+ assert wc1.timestamp == wc2.timestamp
+ assert wc1.original_url == wc2.original_url
+
+ # TODO: check release expansion
+ r1 = api.get_release(r1edit.ident, expand="webcaptures")
+ print(r1)
+ assert r1.webcaptures[0].cdx == wc1.cdx
+
+def test_webcapture_examples(api):
+ wc3 = api.get_webcapture('aaaaaaaaaaaaa53xaaaaaaaaam')
+
+ assert wc3.cdx[0].surt == 'org,asheesh)/'
+ assert wc3.cdx[1].sha1 == 'a637f1d27d9bcb237310ed29f19c07e1c8cf0aa5'
+ assert wc3.archive_urls[1].rel == 'warc'
+
+
+def test_bad_webcapture(api):
+
+ eg = quick_eg(api)
+
+ bad_list = [
+ # good (for testing test itself)
+ WebcaptureEntity(cdx=[
+ WebcaptureEntityCdx(
+ surt="site,example,)/123.jpg",
+ url="http://example.site/123.jpg",
+ sha1="455face3598611458efe1f072e58624790a67266",
+ timestamp=201506071122)]),
+ ]
+
+ for b in bad_list:
+ with pytest.raises(fatcat_client.rest.ApiException):
+ api.create_webcapture(b, editgroup_id=eg.editgroup_id)
+
diff --git a/python/tests/citation_efficiency.py b/python/tests/citation_efficiency.py
new file mode 100644
index 00000000..fe5006cc
--- /dev/null
+++ b/python/tests/citation_efficiency.py
@@ -0,0 +1,113 @@
+
+import json
+import pytest
+from copy import copy
+
+from fatcat_client import *
+from fatcat_client.rest import ApiException
+from fixtures import *
+
+
+def test_citation_indexing(api):
+ # indexing is consistent and reacts to change
+
+ eg = quick_eg(api)
+ r1 = ReleaseEntity(title="the target")
+ r1.refs = [
+ ReleaseRef(key="first", title="the first title"),
+ ReleaseRef(key="second", title="the second title"),
+ ReleaseRef(key="third", title="a third title"),
+ ]
+ r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident)
+ api.accept_editgroup(eg.editgroup_id)
+
+ assert r1.refs[0].index == 0
+ assert r1.refs[0].key == "first"
+ assert r1.refs[1].index == 1
+ assert r1.refs[1].key == "second"
+ assert r1.refs[2].index == 2
+ assert r1.refs[2].key == "third"
+
+ r1.refs.pop(1)
+ eg = quick_eg(api)
+ api.update_release(r1.ident, r1, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ r1 = api.get_release(r1.ident)
+
+ assert r1.refs[0].index == 0
+ assert r1.refs[0].key == "first"
+ assert r1.refs[1].index == 1
+ assert r1.refs[1].key == "third"
+
+def test_citation_targets(api):
+ # invariant to linking citations
+ # also, updates work
+
+ eg = quick_eg(api)
+ r1 = ReleaseEntity(title="the target")
+ r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident)
+ r2 = ReleaseEntity(title="the citer")
+ r2.refs = [
+ ReleaseRef(key="first", title="something else"),
+ ReleaseRef(key="second", title="the target title"),
+ ]
+ r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident)
+ api.accept_editgroup(eg.editgroup_id)
+
+ eg = quick_eg(api)
+ r2.refs[1].target_release_id = r1.ident
+ api.update_release(r2.ident, r2, editgroup_id=eg.editgroup_id)
+ api.accept_editgroup(eg.editgroup_id)
+ r2 = api.get_release(r2.ident)
+ assert r2.refs[0].key == "first"
+ assert r2.refs[1].key == "second"
+ assert r2.refs[0].index == 0 # TODO: one-indexing?
+ assert r2.refs[1].index == 1
+ assert r2.refs[0].target_release_id == None
+ assert r2.refs[1].target_release_id == r1.ident
+ assert len(r2.refs) == 2
+
+def test_citation_empty_array(api):
+ # distinction between empty array (no citations) and no array (hidden)
+
+ r1 = ReleaseEntity(title="citation null")
+ r2 = ReleaseEntity(title="citation empty array")
+ r1.refs = None
+ r2.refs = []
+
+ eg = quick_eg(api)
+ r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident)
+ r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident)
+ api.accept_editgroup(eg.editgroup_id)
+
+ print(r1.refs)
+ print(r2.refs)
+ assert r1.refs == []
+ assert r1.refs == r2.refs
+
+ r1b = api.get_release(r1.ident, hide="refs")
+ assert r1b.refs == None
+
+def test_citation_encoding(api):
+ # escape-only changes (eg, \u1234 whatever for ASCII)
+
+ r1 = ReleaseEntity(title="citation encoding")
+ title = "title-unicode \\u0050 \\\" "
+ container = "container-unicode ☃︎ ä ö ü スティー"
+ extra = {'a': 1, 'b': 2, 'ö': 3}
+ locator = "p123"
+ r1.refs = [
+ ReleaseRef(key="1", year=1923, title=title, container_name=container,
+ extra=extra, locator=locator),
+ ReleaseRef(key="2"),
+ ]
+
+ eg = quick_eg(api)
+ r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident)
+ api.accept_editgroup(eg.editgroup_id)
+
+ assert title == r1.refs[0].title
+ assert container == r1.refs[0].container_name
+ assert extra == r1.refs[0].extra
+ assert locator == r1.refs[0].locator
+
diff --git a/python/tests/cli.sh b/python/tests/cli.sh
index eba6d3a7..19d8a85b 100755
--- a/python/tests/cli.sh
+++ b/python/tests/cli.sh
@@ -14,7 +14,7 @@ set -x
./fatcat_import.py crossref tests/files/crossref-works.2018-01-21.badsample.json tests/files/ISSN-to-ISSN-L.snip.txt
./fatcat_import.py orcid tests/files/0000-0001-8254-7103.json
-./fatcat_import.py issn tests/files/journal_extra_metadata.snip.csv
+./fatcat_import.py journal-metadata tests/files/journal_extra_metadata.snip.csv
./fatcat_import.py matched tests/files/matched_sample.json
./fatcat_import.py matched tests/files/example_matched.json
./fatcat_import.py grobid-metadata tests/files/example_grobid_metadata_lines.tsv
diff --git a/python/tests/files/crossref-works.single.json b/python/tests/files/crossref-works.single.json
index 2af2b358..e3d2e05c 100644
--- a/python/tests/files/crossref-works.single.json
+++ b/python/tests/files/crossref-works.single.json
@@ -84,7 +84,7 @@
{
"given": "Carlos G.",
"family": "Diaz",
- "affiliation": ["Some University"]
+ "affiliation": [{"name": "Some University"}, {"name": "Some Department"}]
},
{
"given": "Francisco M.",
diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index 6a880c48..3cc275b3 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -28,6 +28,7 @@ def api():
conf.api_key["Authorization"] = os.getenv("FATCAT_API_AUTH_TOKEN")
conf.api_key_prefix["Authorization"] = "Bearer"
api_client = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+ api_client.editor_id = "aaaaaaaaaaaabkvkaaaaaaaaae"
return api_client
def test_get_changelog_entry(api):
@@ -38,33 +39,6 @@ def test_get_changelog_entry(api):
## Helpers ##################################################################
def quick_eg(api_inst):
- eg = api_inst.create_editgroup(
- fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+ eg = api_inst.create_editgroup(fatcat_client.Editgroup())
return eg
-# TODO: what are these even here for?
-def check_entity_fields(e):
- for key in ('rev', 'is_live', 'redirect_id'):
- assert key in e
- for key in ('id',):
- assert e[key] is not None
-
-def check_release(e):
- for key in ('work', 'release_type'):
- assert key in e
- for key in ('title', ):
- assert e[key] is not None
- for key in ('refs', 'creators'):
- assert type(e[key]) == list
-
-def check_creator(e):
- for key in ('name',):
- assert e[key] is not None
-
-def check_container(e):
- for key in ('name',):
- assert e[key] is not None
-
-def check_file(e):
- for key in ('size', 'sha1'):
- assert e[key] is not None
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index e2ca6122..193f78f6 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,35 +1,51 @@
import json
import pytest
-from fatcat_tools.importers import CrossrefImporter
+from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
from fixtures import api
@pytest.fixture(scope="function")
def crossref_importer(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=False)
+ yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True)
@pytest.fixture(scope="function")
def crossref_importer_existing(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=True)
+ yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
def test_crossref_importer_batch(crossref_importer):
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
- crossref_importer.process_batch(f)
+ JsonLinePusher(crossref_importer, f).run()
def test_crossref_importer(crossref_importer):
+ last_index = crossref_importer.api.get_changelog(limit=1)[0].index
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
- crossref_importer.process_source(f)
+ crossref_importer.bezerk_mode = True
+ counts = JsonLinePusher(crossref_importer, f).run()
+ assert counts['insert'] == 14
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
# fetch most recent editgroup
- changes = crossref_importer.api.get_changelog(limit=1)
- eg = changes[0].editgroup
+ change = crossref_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
assert eg.description
assert "crossref" in eg.description.lower()
assert eg.extra['git_rev']
assert "fatcat_tools.CrossrefImporter" in eg.extra['agent']
+ last_index = crossref_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
+ crossref_importer.bezerk_mode = False
+ crossref_importer.reset()
+ counts = JsonLinePusher(crossref_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 14
+ assert counts['skip'] == 0
+ assert last_index == crossref_importer.api.get_changelog(limit=1)[0].index
+
def test_crossref_mappings(crossref_importer):
assert crossref_importer.map_release_type('journal-article') == "article-journal"
assert crossref_importer.map_release_type('asdf') is None
@@ -39,13 +55,13 @@ def test_crossref_mappings(crossref_importer):
def test_crossref_importer_create(crossref_importer):
crossref_importer.create_containers = True
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
- crossref_importer.process_source(f)
+ JsonLinePusher(crossref_importer, f).run()
def test_crossref_dict_parse(crossref_importer):
with open('tests/files/crossref-works.single.json', 'r') as f:
# not a single line
raw = json.loads(f.read())
- (r, c) = crossref_importer.parse_crossref_dict(raw)
+ r = crossref_importer.parse_record(raw)
extra = r.extra['crossref']
assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators"
assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t"
@@ -61,7 +77,8 @@ def test_crossref_dict_parse(crossref_importer):
assert len(r.contribs) == 5
assert r.contribs[0].raw_name == "Marcelo D. Radicioni"
assert r.contribs[0].index == 0
- assert r.contribs[1].extra['affiliations'] == ["Some University"]
+ assert r.contribs[1].raw_affiliation == "Some University"
+ assert r.contribs[1].extra['more_affiliations'] == ["Some Department"]
assert r.contribs[1].role == "author"
assert r.contribs[3].role == "editor"
assert r.contribs[3].index is None
@@ -78,8 +95,10 @@ def test_crossref_dict_parse(crossref_importer):
def test_stateful_checking(crossref_importer_existing):
with open('tests/files/crossref-works.single.json', 'r') as f:
# not a single line, a whole document
- raw = json.loads(f.read())
+ raw = f.read()
# might not exist yet...
- crossref_importer_existing.process_source([json.dumps(raw)])
- # ok, make sure we get 'None' back
- assert crossref_importer_existing.parse_crossref_dict(raw) is None
+ crossref_importer_existing.push_record(json.loads(raw))
+ crossref_importer_existing.finish()
+ # make sure we wouldn't insert again
+ entity = crossref_importer_existing.parse_record(json.loads(raw))
+ assert crossref_importer_existing.try_update(entity) is False
diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py
index 97ebcaef..4fed4aaa 100644
--- a/python/tests/import_grobid_metadata.py
+++ b/python/tests/import_grobid_metadata.py
@@ -3,7 +3,7 @@ import os
import json
import base64
import pytest
-from fatcat_tools.importers import GrobidMetadataImporter
+from fatcat_tools.importers import GrobidMetadataImporter, LinePusher
from fixtures import api
"""
@@ -15,10 +15,6 @@ side-effects. Should probably be disabled or re-written.
def grobid_metadata_importer(api):
yield GrobidMetadataImporter(api)
-# TODO: use API to check that entities actually created...
-#def test_grobid_metadata_importer_batch(grobid_metadata_importer):
-# with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f:
-# grobid_metadata_importer.process_batch(f)
def test_grobid_metadata_parse(grobid_metadata_importer):
with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f:
@@ -30,7 +26,8 @@ def test_grobid_metadata_parse(grobid_metadata_importer):
print(re.contribs)
assert re.contribs[0].raw_name == "Wahyu Ary"
assert re.publisher == None
- assert re.extra.get('container_name') == None
+ if re.extra:
+ assert re.extra.get('container_name') == None
assert len(re.refs) == 27
def test_file_metadata_parse(grobid_metadata_importer):
@@ -53,13 +50,28 @@ def test_file_metadata_parse(grobid_metadata_importer):
assert len(fe.release_ids) == 0
def test_grobid_metadata_importer(grobid_metadata_importer):
+ last_index = grobid_metadata_importer.api.get_changelog(limit=1)[0].index
with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f:
- grobid_metadata_importer.process_source(f)
+ grobid_metadata_importer.bezerk_mode = True
+ counts = LinePusher(grobid_metadata_importer, f).run()
+ assert counts['insert'] == 10
+ assert counts['inserted.release'] == 10
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
# fetch most recent editgroup
- changes = grobid_metadata_importer.api.get_changelog(limit=1)
- eg = changes[0].editgroup
+ change = grobid_metadata_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
assert eg.description
assert "grobid" in eg.description.lower()
assert eg.extra['git_rev']
assert "fatcat_tools.GrobidMetadataImporter" in eg.extra['agent']
+
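+    # re-import in non-bezerk mode; all releases should already exist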
+ with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f:
+ grobid_metadata_importer.reset()
+ grobid_metadata_importer.bezerk_mode = False
+ counts = LinePusher(grobid_metadata_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['inserted.release'] == 0
+ assert counts['exists'] == 10
+ assert counts['skip'] == 0
diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py
deleted file mode 100644
index 6b5978d9..00000000
--- a/python/tests/import_issn.py
+++ /dev/null
@@ -1,26 +0,0 @@
-
-import pytest
-from fatcat_tools.importers import IssnImporter
-from fixtures import api
-
-
-@pytest.fixture(scope="function")
-def issn_importer(api):
- yield IssnImporter(api)
-
-# TODO: use API to check that entities actually created...
-def test_issn_importer_batch(issn_importer):
- with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
- issn_importer.process_csv_batch(f)
-
-def test_issn_importer(issn_importer):
- with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
- issn_importer.process_csv_source(f)
-
- # fetch most recent editgroup
- changes = issn_importer.api.get_changelog(limit=1)
- eg = changes[0].editgroup
- assert eg.description
- assert "container" in eg.description.lower()
- assert eg.extra['git_rev']
- assert "fatcat_tools.IssnImporter" in eg.extra['agent']
diff --git a/python/tests/import_journal_metadata.py b/python/tests/import_journal_metadata.py
new file mode 100644
index 00000000..a2b10a65
--- /dev/null
+++ b/python/tests/import_journal_metadata.py
@@ -0,0 +1,39 @@
+
+import pytest
+from fatcat_tools.importers import JournalMetadataImporter, CsvPusher
+from fixtures import api
+
+
+@pytest.fixture(scope="function")
+def journal_metadata_importer(api):
+ yield JournalMetadataImporter(api)
+
+# TODO: use API to check that entities actually created...
+def test_journal_metadata_importer_batch(journal_metadata_importer):
+ with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
+ CsvPusher(journal_metadata_importer, f).run()
+
+def test_journal_metadata_importer(journal_metadata_importer):
+ last_index = journal_metadata_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
+ journal_metadata_importer.bezerk_mode = True
+ counts = CsvPusher(journal_metadata_importer, f).run()
+ assert counts['insert'] == 9
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = journal_metadata_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "container" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.JournalMetadataImporter" in eg.extra['agent']
+
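+    # re-import in non-bezerk mode; all containers should already exist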
+ with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
+ journal_metadata_importer.reset()
+ journal_metadata_importer.bezerk_mode = False
+ counts = CsvPusher(journal_metadata_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 9
+ assert counts['skip'] == 0
diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py
index 080674ac..8f694456 100644
--- a/python/tests/import_matched.py
+++ b/python/tests/import_matched.py
@@ -1,7 +1,7 @@
import json
import pytest
-from fatcat_tools.importers import MatchedImporter
+from fatcat_tools.importers import MatchedImporter, JsonLinePusher
from fixtures import api
@@ -10,26 +10,40 @@ def matched_importer(api):
yield MatchedImporter(api)
# TODO: use API to check that entities actually created...
def test_matched_importer_batch(matched_importer):
with open('tests/files/example_matched.json', 'r') as f:
- matched_importer.process_batch(f)
+ JsonLinePusher(matched_importer, f).run()
def test_matched_importer(matched_importer):
+ last_index = matched_importer.api.get_changelog(limit=1)[0].index
with open('tests/files/example_matched.json', 'r') as f:
- matched_importer.process_source(f)
+ matched_importer.bezerk_mode = True
+ counts = JsonLinePusher(matched_importer, f).run()
+ assert counts['insert'] == 2
+ assert counts['exists'] == 0
+ assert counts['skip'] == 11
# fetch most recent editgroup
- changes = matched_importer.api.get_changelog(limit=1)
- eg = changes[0].editgroup
+ change = matched_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
assert eg.description
assert "file-to-release" in eg.description.lower()
assert eg.extra['git_rev']
assert "fatcat_tools.MatchedImporter" in eg.extra['agent']
+ # re-insert; should skip
+ with open('tests/files/example_matched.json', 'r') as f:
+ matched_importer.reset()
+ matched_importer.bezerk_mode = False
+ counts = JsonLinePusher(matched_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 2
+ assert counts['skip'] == 11
+
def test_matched_dict_parse(matched_importer):
with open('tests/files/example_matched.json', 'r') as f:
raw = json.loads(f.readline())
- f = matched_importer.parse_matched_dict(raw)
+ f = matched_importer.parse_record(raw)
assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313"
assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff"
assert f.mimetype == "application/pdf"
diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py
index 717a1328..57886b52 100644
--- a/python/tests/import_orcid.py
+++ b/python/tests/import_orcid.py
@@ -1,7 +1,7 @@
import json
import pytest
-from fatcat_tools.importers import OrcidImporter
+from fatcat_tools.importers import OrcidImporter, JsonLinePusher
from fixtures import api
@@ -9,37 +9,46 @@ from fixtures import api
def orcid_importer(api):
yield OrcidImporter(api)
-# TODO: use API to check that entities actually created...
-def test_orcid_importer_batch(orcid_importer):
- with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
- orcid_importer.process_batch(f)
-
def test_orcid_importer_badid(orcid_importer):
with open('tests/files/0000-0001-8254-710X.json', 'r') as f:
- orcid_importer.process_batch(f)
+ JsonLinePusher(orcid_importer, f).run()
+# TODO: use API to check that entities actually created...
def test_orcid_importer(orcid_importer):
+ last_index = orcid_importer.api.get_changelog(limit=1)[0].index
with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
- orcid_importer.process_source(f)
+ orcid_importer.bezerk_mode = True
+ counts = JsonLinePusher(orcid_importer, f).run()
+ assert counts['insert'] == 1
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
# fetch most recent editgroup
- changes = orcid_importer.api.get_changelog(limit=1)
- eg = changes[0].editgroup
+ change = orcid_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
assert eg.description
assert "orcid" in eg.description.lower()
assert eg.extra['git_rev']
assert "fatcat_tools.OrcidImporter" in eg.extra['agent']
+ with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
+ orcid_importer.reset()
+ orcid_importer.bezerk_mode = False
+ counts = JsonLinePusher(orcid_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 1
+ assert counts['skip'] == 0
+
def test_orcid_importer_x(orcid_importer):
with open('tests/files/0000-0003-3953-765X.json', 'r') as f:
- orcid_importer.process_source(f)
+ JsonLinePusher(orcid_importer, f).run()
c = orcid_importer.api.lookup_creator(orcid="0000-0003-3953-765X")
assert c is not None
def test_orcid_dict_parse(orcid_importer):
with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
raw = json.loads(f.readline())
- c = orcid_importer.parse_orcid_dict(raw)
+ c = orcid_importer.parse_record(raw)
assert c.given_name == "Man-Hui"
assert c.surname == "Li"
assert c.display_name == "Man-Hui Li"
diff --git a/python/tests/importer.py b/python/tests/importer.py
index 34efa5d8..9308ba84 100644
--- a/python/tests/importer.py
+++ b/python/tests/importer.py
@@ -1,13 +1,13 @@
import pytest
-from fatcat_tools.importers import FatcatImporter
+from fatcat_tools.importers import CrossrefImporter, OrcidImporter
from fixtures import api
def test_issnl_mapping_lookup(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- fi = FatcatImporter(api, issn_map_file=issn_file)
+ fi = CrossrefImporter(api, issn_map_file=issn_file)
assert fi.issn2issnl('0000-0027') == '0002-0027'
assert fi.issn2issnl('0002-0027') == '0002-0027'
@@ -18,20 +18,18 @@ def test_issnl_mapping_lookup(api):
def test_identifiers(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- fi = FatcatImporter(api, issn_map_file=issn_file)
-
- assert fi.is_issnl("1234-5678") == True
- assert fi.is_issnl("1234-5678.") == False
- assert fi.is_issnl("12345678") == False
- assert fi.is_issnl("1-2345678") == False
-
- assert fi.is_doi("10.1234/56789") == True
- assert fi.is_doi("101234/56789") == False
- assert fi.is_doi("10.1234_56789") == False
-
- assert fi.is_orcid("0000-0003-3118-6591") == True
- assert fi.is_orcid("0000-0003-3953-765X") == True
- assert fi.is_orcid("0000-00x3-3118-659") == False
- assert fi.is_orcid("0000-00033118-659") == False
- assert fi.is_orcid("0000-0003-3118-659.") == False
+ ci = CrossrefImporter(api, issn_map_file=issn_file)
+
+ assert ci.is_issnl("1234-5678") == True
+ assert ci.is_issnl("1234-5678.") == False
+ assert ci.is_issnl("12345678") == False
+ assert ci.is_issnl("1-2345678") == False
+
+ oi = OrcidImporter(api)
+
+ assert oi.is_orcid("0000-0003-3118-6591") == True
+ assert oi.is_orcid("0000-0003-3953-765X") == True
+ assert oi.is_orcid("0000-00x3-3118-659") == False
+ assert oi.is_orcid("0000-00033118-659") == False
+ assert oi.is_orcid("0000-0003-3118-659.") == False
diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py
index e9d23250..6d6c6c82 100644
--- a/python/tests/transform_tests.py
+++ b/python/tests/transform_tests.py
@@ -11,7 +11,7 @@ def test_elasticsearch_convert(crossref_importer):
with open('tests/files/crossref-works.single.json', 'r') as f:
# not a single line
raw = json.loads(f.read())
- (r, c) = crossref_importer.parse_crossref_dict(raw)
+ r = crossref_importer.parse_record(raw)
r.state = 'active'
release_to_elasticsearch(r)