diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 21:07:01 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 21:07:01 -0800 |
commit | 642501efc92e31ac438fe0f70820415c825a3802 (patch) | |
tree | 967fa2a0c015f221bce1ad1bcdc6c3c021e837f5 | |
parent | 1536109fff643767b6e8f8515fe0eb4d8cae5854 (diff) | |
download | fatcat-642501efc92e31ac438fe0f70820415c825a3802.tar.gz fatcat-642501efc92e31ac438fe0f70820415c825a3802.zip |
start changes to release ES schema
-rw-r--r-- | extra/elasticsearch/release_schema.json | 61 | ||||
-rw-r--r-- | python/Pipfile.lock | 189 | ||||
-rw-r--r-- | python/fatcat_tools/transforms.py | 120 | ||||
-rw-r--r-- | python/tests/import_crossref.py | 2 | ||||
-rw-r--r-- | python/tests/import_grobid_metadata.py | 3 |
5 files changed, 234 insertions, 141 deletions
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 9a712ad6..4cfe0894 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -31,39 +31,56 @@ "state": { "type": "keyword" }, "revision": { "type": "keyword" }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "author": { "type": "alias", "path": "contrib_names" }, - "journal": { "type": "alias", "path": "container_name" }, - "date": { "type": "alias", "path": "release_date" }, - "year": { "type": "alias", "path": "release_year" }, - "issn": { "type": "alias", "path": "container_issnl" }, - "oa": { "type": "alias", "path": "container_is_oa" }, - "longtail": { "type": "alias", "path": "container_is_longtail_oa" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "release_date": { "type": "date" }, "release_year": { "type": "integer" }, "release_type": { "type": "keyword" }, "release_status": { "type": "keyword" }, - "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": "keyword" }, - "isbn13": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, + "language": { "type": "keyword" }, + "doi": { "type": "keyword" }, + "pmid": { "type": "keyword" }, + "pmcid": { "type": "keyword" }, + "isbn13": { "type": "keyword" }, + "wikidata_qid": { "type": "keyword" }, + "core_id": { "type": "keyword" }, + "axiv_id": { "type": "keyword" }, + "jstor_id": { "type": "keyword" }, + "license": { "type": "keyword" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_issnl": { "type": "keyword" }, - "container_is_oa": { "type": "boolean" }, - "container_is_longtail_oa": { "type": "boolean" }, + "container_type": { "type": "keyword" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "ref_count": { "type": "integer" }, "file_count": { "type": "integer" }, - "file_pdf_url": { "type": "keyword" }, - "file_in_webarchive": { "type": "boolean" }, - "file_in_ia": { "type": "boolean" }, + "fileset_count": { "type": "integer" }, + "webcapture_count": { "type": "integer" }, "any_abstract": { "type": "boolean" }, - "is_kept": { "type": "boolean" }, - "in_shadow": { "type": "boolean" } + + "best_pdf_url": { "type": "keyword" }, + "ia_pdf_url": { "type": "keyword" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "is_preserved": { "type": "boolean" }, + "in_kbart": { "type": "boolean" }, + "in_jstor": { "type": "boolean" }, + "in_dweb": { "type": "boolean" }, + "in_web": { "type": "boolean" }, + "in_ia": { "type": "boolean" }, + "in_sim": { "type": "boolean" }, + "in_shadows": { "type": "boolean" }, + + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "year": { "type": "alias", "path": "release_year" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "oa": { "type": "alias", "path": "is_oa" }, + "longtail": { "type": "alias", "path": "is_longtail_oa" }, + "lang": { "type": "alias", "path": "language" }, + "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, + "is_kept": { "type": "alias", "path": "in_kbart" } } } } diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 296079f0..f2d39a99 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "c3deb49cf4c122c2aed3f4f944e9763cfcf40c85891ca3d3e9cabc3debbb9075" + "sha256": "8f98bb3f6a3083c8b03cb68d1ee48b25449a950dd8a9d15189f2eb4fae48f760" }, "pipfile-spec": 6, "requires": { @@ -96,27 +96,27 @@ }, "cryptography": { "hashes": [ - "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd", - "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038", - "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378", - "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3", - "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13", - "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc", - "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f", - "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427", - "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515", - "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b", - "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf", - "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681", - "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b", - "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5", - "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b", - "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6", - "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70", - "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e", - "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859" - ], - "version": "==2.4.2" + "sha256:05b3ded5e88747d28ee3ef493f2b92cbb947c1e45cf98cfef22e6d38bb67d4af", + "sha256:06826e7f72d1770e186e9c90e76b4f84d90cdb917b47ff88d8dc59a7b10e2b1e", + "sha256:08b753df3672b7066e74376f42ce8fc4683e4fd1358d34c80f502e939ee944d2", + "sha256:2cd29bd1911782baaee890544c653bb03ec7d95ebeb144d714b0f5c33deb55c7", + "sha256:31e5637e9036d966824edaa91bf0aa39dc6f525a1c599f39fd5c50340264e079", + "sha256:42fad67d7072216a49e34f923d8cbda9edacbf6633b19a79655e88a1b4857063", + "sha256:4946b67235b9d2ea7d31307be9d5ad5959d6c4a8f98f900157b47abddf698401", + "sha256:522fdb2809603ee97a4d0ef2f8d617bc791eb483313ba307cb9c0a773e5e5695", + "sha256:6f841c7272645dd7c65b07b7108adfa8af0aaea57f27b7f59e01d41f75444c85", + "sha256:7d335e35306af5b9bc0560ca39f740dfc8def72749645e193dd35be11fb323b3", + "sha256:8504661ffe324837f5c4607347eeee4cf0fcad689163c6e9c8d3b18cf1f4a4ad", + "sha256:9260b201ce584d7825d900c88700aa0bd6b40d4ebac7b213857bd2babee9dbca", + "sha256:9a30384cc402eac099210ab9b8801b2ae21e591831253883decdb4513b77a3cd", + "sha256:9e29af877c29338f0cab5f049ccc8bd3ead289a557f144376c4fbc7d1b98914f", + "sha256:ab50da871bc109b2d9389259aac269dd1b7c7413ee02d06fe4e486ed26882159", + "sha256:b13c80b877e73bcb6f012813c6f4a9334fcf4b0e96681c5a15dac578f2eedfa0", + "sha256:bfe66b577a7118e05b04141f0f1ed0959552d45672aa7ecb3d91e319d846001e", + "sha256:e091bd424567efa4b9d94287a952597c05d22155a13716bf5f9f746b9dc906d3", + "sha256:fa2b38c8519c5a3aa6e2b4e1cf1a549b54acda6adb25397ff542068e73d1ed00" + ], + "version": "==2.5" }, "fatcat-client": { "editable": true, @@ -152,6 +152,14 @@ "index": "pypi", "version": "==0.2" }, + "ftfy": { + "hashes": [ + "sha256:84a1614190173bb447ac9d581e50185c6aa35b538754b6bedaba0cc0f83d8e80", + "sha256:fa74757fb7cb444366fa6a79c2feabd40281a44dfbf6eaed492a804764ee26b2" + ], + "index": "pypi", + "version": "==5.5.1" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -366,6 +374,13 @@ ], "version": "==1.24.1" }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, "werkzeug": { "hashes": [ "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", @@ -558,10 +573,10 @@ }, "parso": { "hashes": [ - "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", - "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d", + "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31" ], - "version": "==0.3.1" + "version": "==0.3.2" }, "pathlib2": { "hashes": [ @@ -595,10 +610,10 @@ }, "pluggy": { "hashes": [ - "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095", - "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f" + "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", + "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a" ], - "version": "==0.8.0" + "version": "==0.8.1" }, "prompt-toolkit": { "hashes": [ @@ -610,38 +625,38 @@ }, "psycopg2": { "hashes": [ - "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e", - "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237", - "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f", - "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300", - "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076", - "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221", - "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82", - "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92", - "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3", - "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6", - "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e", - "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae", - "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535", - "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f", - "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26", - "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7", - "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53", - "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b", - "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc", - "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee", - "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20", - "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d", - "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9", - "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244", - "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f", - "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b", - "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370", - "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9", - "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d", - "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0" - ], - "version": "==2.7.6.1" + "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94", + "sha256:0e9873e60f98f0c52339abf8f0339d1e22bfe5aae0bcf7aabd40c055175035ec", + "sha256:1148a5eb29073280bf9057c7fc45468592c1bb75a28f6df1591adb93c8cb63d0", + "sha256:259a8324e109d4922b0fcd046e223e289830e2568d6f4132a3702439e5fd532b", + "sha256:28dffa9ed4595429e61bacac41d3f9671bb613d1442ff43bcbec63d4f73ed5e8", + "sha256:314a74302d4737a3865d40ea50e430ce1543c921ba10f39d562e807cfe2edf2a", + "sha256:36b60201b6d215d7658a71493fdf6bd5e60ad9a0cffed39906627ff9f4f3afd3", + "sha256:3f9d532bce54c4234161176ff3b8688ff337575ca441ea27597e112dfcd0ee0c", + "sha256:5d222983847b40af989ad96c07fc3f07e47925e463baa5de716be8f805b41d9b", + "sha256:6757a6d2fc58f7d8f5d471ad180a0bd7b4dd3c7d681f051504fbea7ae29c8d6f", + "sha256:6a0e0f1e74edb0ab57d89680e59e7bfefad2bfbdf7c80eb38304d897d43674bb", + "sha256:6ca703ccdf734e886a1cf53eb702261110f6a8b0ed74bcad15f1399f74d3f189", + "sha256:8513b953d8f443c446aa79a4cc8a898bd415fc5e29349054f03a7d696d495542", + "sha256:9262a5ce2038570cb81b4d6413720484cb1bc52c064b2f36228d735b1f98b794", + "sha256:97441f851d862a0c844d981cbee7ee62566c322ebb3d68f86d66aa99d483985b", + "sha256:a07feade155eb8e69b54dd6774cf6acf2d936660c61d8123b8b6b1f9247b67d6", + "sha256:a9b9c02c91b1e3ec1f1886b2d0a90a0ea07cc529cb7e6e472b556bc20ce658f3", + "sha256:ae88216f94728d691b945983140bf40d51a1ff6c7fe57def93949bf9339ed54a", + "sha256:b360ffd17659491f1a6ad7c928350e229c7b7bd83a2b922b6ee541245c7a776f", + "sha256:b4221957ceccf14b2abdabef42d806e791350be10e21b260d7c9ce49012cc19e", + "sha256:b90758e49d5e6b152a460d10b92f8a6ccf318fcc0ee814dcf53f3a6fc5328789", + "sha256:c669ea986190ed05fb289d0c100cc88064351f2b85177cbfd3564c4f4847d18c", + "sha256:d1b61999d15c79cf7f4f7cc9021477aef35277fc52452cf50fd13b713c84424d", + "sha256:de7bb043d1adaaf46e38d47e7a5f703bb3dab01376111e522b07d25e1a79c1e1", + "sha256:e393568e288d884b94d263f2669215197840d097c7e5b0acd1a51c1ea7d1aba8", + "sha256:ed7e0849337bd37d89f2c2b0216a0de863399ee5d363d31b1e5330a99044737b", + "sha256:f153f71c3164665d269a5d03c7fa76ba675c7a8de9dc09a4e2c2cdc9936a7b41", + "sha256:f1fb5a8427af099beb7f65093cbdb52e021b8e6dbdfaf020402a623f4181baf5", + "sha256:f36b333e9f86a2fba960c72b90c34be6ca71819e300f7b1fc3d2b0f0b2c546cd", + "sha256:f4526d078aedd5187d0508aa5f9a01eae6a48a470ed678406da94b4cd6524b7e" + ], + "version": "==2.7.7" }, "ptyprocess": { "hashes": [ @@ -674,11 +689,11 @@ }, "pytest": { "hashes": [ - "sha256:3e65a22eb0d4f1bdbc1eacccf4a3198bf8d4049dea5112d70a0c61b00e748d02", - "sha256:5924060b374f62608a078494b909d341720a050b5224ff87e17e12377486a71d" + "sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2", + "sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be" ], "index": "pypi", - "version": "==4.1.0" + "version": "==4.1.1" }, "pytest-cov": { "hashes": [ @@ -727,30 +742,30 @@ }, "typed-ast": { "hashes": [ - "sha256:0555eca1671ebe09eb5f2176723826f6f44cca5060502fea259de9b0e893ab53", - "sha256:0ca96128ea66163aea13911c9b4b661cb345eb729a20be15c034271360fc7474", - "sha256:16ccd06d614cf81b96de42a37679af12526ea25a208bce3da2d9226f44563868", - "sha256:1e21ae7b49a3f744958ffad1737dfbdb43e1137503ccc59f4e32c4ac33b0bd1c", - "sha256:37670c6fd857b5eb68aa5d193e14098354783b5138de482afa401cc2644f5a7f", - "sha256:46d84c8e3806619ece595aaf4f37743083f9454c9ea68a517f1daa05126daf1d", - "sha256:5b972bbb3819ece283a67358103cc6671da3646397b06e7acea558444daf54b2", - "sha256:6306ffa64922a7b58ee2e8d6f207813460ca5a90213b4a400c2e730375049246", - "sha256:6cb25dc95078931ecbd6cbcc4178d1b8ae8f2b513ae9c3bd0b7f81c2191db4c6", - "sha256:7e19d439fee23620dea6468d85bfe529b873dace39b7e5b0c82c7099681f8a22", - "sha256:7f5cd83af6b3ca9757e1127d852f497d11c7b09b4716c355acfbebf783d028da", - "sha256:81e885a713e06faeef37223a5b1167615db87f947ecc73f815b9d1bbd6b585be", - "sha256:94af325c9fe354019a29f9016277c547ad5d8a2d98a02806f27a7436b2da6735", - "sha256:b1e5445c6075f509d5764b84ce641a1535748801253b97f3b7ea9d948a22853a", - "sha256:cb061a959fec9a514d243831c514b51ccb940b58a5ce572a4e209810f2507dcf", - "sha256:cc8d0b703d573cbabe0d51c9d68ab68df42a81409e4ed6af45a04a95484b96a5", - "sha256:da0afa955865920edb146926455ec49da20965389982f91e926389666f5cf86a", - "sha256:dc76738331d61818ce0b90647aedde17bbba3d3f9e969d83c1d9087b4f978862", - "sha256:e7ec9a1445d27dbd0446568035f7106fa899a36f55e52ade28020f7b3845180d", - "sha256:f741ba03feb480061ab91a465d1a3ed2d40b52822ada5b4017770dfcb88f839f", - "sha256:fe800a58547dd424cd286b7270b967b5b3316b993d86453ede184a17b5a6b17d" + "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe", + "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c", + "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2", + "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a", + "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7", + "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827", + "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33", + "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9", + "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032", + "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9", + "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2", + "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2", + "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062", + "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15", + "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357", + "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a", + "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824", + "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442", + "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1", + "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2", + "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6" ], "markers": "python_version < '3.7' and implementation_name == 'cpython'", - "version": "==1.1.1" + "version": "==1.2.0" }, "urllib3": { "hashes": [ @@ -768,9 +783,9 @@ }, "wrapt": { "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" + "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533" ], - "version": "==1.10.11" + "version": "==1.11.1" } } } diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index b1fd7e68..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ + import collections from fatcat_client import ReleaseEntity, ApiClient @@ -40,17 +41,29 @@ def release_to_elasticsearch(release): state = release.state, revision = release.revision, title = release.title, + original_title = release.original_title, release_type = release.release_type, release_status = release.release_status, language = release.language, + license = release.license_slug, doi = release.doi, pmid = release.pmid, pmcid = release.pmcid, isbn13 = release.isbn13, + wikidata_qid = release.wikidata_qid, core_id = release.core_id, - wikidata_qid = release.wikidata_qid + arxiv_id = release.core_id, + jstor_id = release.jstor_id, ) + is_oa = None + is_longtail_oa = None + in_kbart = None + in_web = False + in_dweb = False + in_ia = False + in_shadow = False + if release.release_date: # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) t['release_date'] = release.release_date.isoformat() @@ -59,52 +72,99 @@ def release_to_elasticsearch(release): if release.release_year is not None: t['release_year'] = release.release_year + t['any_abstract'] = len(release.abstracts) > 0 + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + container = release.container - container_is_kept = False if container: t['publisher'] = container.publisher t['container_name'] = container.name t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + t['container_type'] = container.container_type + if container.extra: + if container.extra.get('is_oa') or container.extra.get('in_doaj'): + is_oa = True + if container.extra.get('in_kbart'): + # TODO: better KBART check goes here + in_kbart = True + if container.extra.get('ia'): + # TODO: container longtail check goes here + # TODO: sim/microfilm check goes here + pass + # TODO: SHERPA/Romeo goes here else: t['publisher'] = release.publisher files = release.files or [] t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None + t['fileset_count'] = len(release.filesets or []) + t['webcapture_count'] = len(release.webcaptures or []) + any_pdf_url = None + good_pdf_url = None + best_pdf_url = None + ia_pdf_url = None for f in files: + if f.extra and f.extra.get('shadows'): + # TODO: shadow check goes here + in_shadows = True is_pdf = 'pdf' in (f.mimetype or '') for url in (f.urls or []): - if url.rel == 'webarchive': - in_wa = True - if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): + if url.url.lower().startswith('http'): + in_web = True + if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + # TODO: not sure what rel will be + in_dweb = True + if is_pdf: + any_pdf_url = url.url + if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf: + is_preserved = True + good_pdf_url = url.url + if '//web.archive.org/' in url.url or '//archive.org/' in url.url: in_ia = True if is_pdf: - t['file_pdf_url'] = url.url - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url.url - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia + best_pdf_url = url.url + ia_pdf_url = url.url + # here is where we bake-in priority; IA-specific + t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url + t['ia_pdf_url'] = ia_pdf_url + + if release.license_slug: + # TODO: more/better checks here, particularly strict *not* OA licenses + if release.license_slug.startswith("CC-"): + is_oa = True extra = release.extra or dict() if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = len(release.abstracts) > 0 - t['is_kept'] = container_is_kept or extra.get('is_kept', False) + # TODO: longtail OA check from GROBID here + if extra.get('in_kbart'): + # NOTE: not actually setting this anywhere + in_kbart = True + if extra.get('is_oa'): + # NOTE: not actually setting this anywhere + is_oa = True + if extra.get('grobid'): + if not t.get('container_name'): + t['container_name'] = extra['grobid'].get('container_name') + if extra['grobid'].get('longtail_oa'): + is_longtail_oa = True + if extra.get('crossref'): + if extra['crossref'].get('archive'): + # all crossref archives are KBART, I believe + in_kbart = True - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names + if is_longtail_oa: + is_oa = True + t['is_oa'] = is_oa + t['is_longtail_oa'] = is_longtail_oa + t['in_kbart'] = in_kbart + t['in_web'] = in_web + t['in_dweb'] = in_dweb + t['in_ia'] = in_ia + t['is_preserved'] = in_ia or in_kbart return t diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index a7826a73..193f78f6 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -78,7 +78,7 @@ def test_crossref_dict_parse(crossref_importer): assert r.contribs[0].raw_name == "Marcelo D. Radicioni" assert r.contribs[0].index == 0 assert r.contribs[1].raw_affiliation == "Some University" - assert r.contribs[1].extra['affiliations'] == ["Some Department"] + assert r.contribs[1].extra['more_affiliations'] == ["Some Department"] assert r.contribs[1].role == "author" assert r.contribs[3].role == "editor" assert r.contribs[3].index is None diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index feb604ce..4fed4aaa 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -26,7 +26,8 @@ def test_grobid_metadata_parse(grobid_metadata_importer): print(re.contribs) assert re.contribs[0].raw_name == "Wahyu Ary" assert re.publisher == None - assert re.extra.get('container_name') == None + if re.extra: + assert re.extra.get('container_name') == None assert len(re.refs) == 27 def test_file_metadata_parse(grobid_metadata_importer): |