aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--extra/elasticsearch/release_schema.json61
-rw-r--r--python/Pipfile.lock189
-rw-r--r--python/fatcat_tools/transforms.py120
-rw-r--r--python/tests/import_crossref.py2
-rw-r--r--python/tests/import_grobid_metadata.py3
5 files changed, 234 insertions, 141 deletions
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 9a712ad6..4cfe0894 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -31,39 +31,56 @@
"state": { "type": "keyword" },
"revision": { "type": "keyword" },
"title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "author": { "type": "alias", "path": "contrib_names" },
- "journal": { "type": "alias", "path": "container_name" },
- "date": { "type": "alias", "path": "release_date" },
- "year": { "type": "alias", "path": "release_year" },
- "issn": { "type": "alias", "path": "container_issnl" },
- "oa": { "type": "alias", "path": "container_is_oa" },
- "longtail": { "type": "alias", "path": "container_is_longtail_oa" },
+ "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
"release_date": { "type": "date" },
"release_year": { "type": "integer" },
"release_type": { "type": "keyword" },
"release_status": { "type": "keyword" },
- "language": { "type": "keyword" },
- "doi": { "type": "keyword" },
- "pmid": { "type": "keyword" },
- "pmcid": { "type": "keyword" },
- "isbn13": { "type": "keyword" },
- "core_id": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
+ "language": { "type": "keyword" },
+ "doi": { "type": "keyword" },
+ "pmid": { "type": "keyword" },
+ "pmcid": { "type": "keyword" },
+ "isbn13": { "type": "keyword" },
+ "wikidata_qid": { "type": "keyword" },
+ "core_id": { "type": "keyword" },
+ "axiv_id": { "type": "keyword" },
+ "jstor_id": { "type": "keyword" },
+ "license": { "type": "keyword" },
"publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
"container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
"container_issnl": { "type": "keyword" },
- "container_is_oa": { "type": "boolean" },
- "container_is_longtail_oa": { "type": "boolean" },
+ "container_type": { "type": "keyword" },
"contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
"ref_count": { "type": "integer" },
"file_count": { "type": "integer" },
- "file_pdf_url": { "type": "keyword" },
- "file_in_webarchive": { "type": "boolean" },
- "file_in_ia": { "type": "boolean" },
+ "fileset_count": { "type": "integer" },
+ "webcapture_count": { "type": "integer" },
"any_abstract": { "type": "boolean" },
- "is_kept": { "type": "boolean" },
- "in_shadow": { "type": "boolean" }
+
+ "best_pdf_url": { "type": "keyword" },
+ "ia_pdf_url": { "type": "keyword" },
+ "is_oa": { "type": "boolean" },
+ "is_longtail_oa": { "type": "boolean" },
+ "is_preserved": { "type": "boolean" },
+ "in_kbart": { "type": "boolean" },
+ "in_jstor": { "type": "boolean" },
+ "in_dweb": { "type": "boolean" },
+ "in_web": { "type": "boolean" },
+ "in_ia": { "type": "boolean" },
+ "in_sim": { "type": "boolean" },
+ "in_shadows": { "type": "boolean" },
+
+ "author": { "type": "alias", "path": "contrib_names" },
+ "journal": { "type": "alias", "path": "container_name" },
+ "date": { "type": "alias", "path": "release_date" },
+ "year": { "type": "alias", "path": "release_year" },
+ "issn": { "type": "alias", "path": "container_issnl" },
+ "oa": { "type": "alias", "path": "is_oa" },
+ "longtail": { "type": "alias", "path": "is_longtail_oa" },
+ "lang": { "type": "alias", "path": "language" },
+ "file_pdf_url": { "type": "alias", "path": "best_pdf_url" },
+ "is_kept": { "type": "alias", "path": "in_kbart" }
}
}
}
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 296079f0..f2d39a99 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "c3deb49cf4c122c2aed3f4f944e9763cfcf40c85891ca3d3e9cabc3debbb9075"
+ "sha256": "8f98bb3f6a3083c8b03cb68d1ee48b25449a950dd8a9d15189f2eb4fae48f760"
},
"pipfile-spec": 6,
"requires": {
@@ -96,27 +96,27 @@
},
"cryptography": {
"hashes": [
- "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd",
- "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038",
- "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378",
- "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3",
- "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13",
- "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc",
- "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f",
- "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427",
- "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515",
- "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b",
- "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf",
- "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681",
- "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b",
- "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5",
- "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b",
- "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6",
- "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70",
- "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e",
- "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859"
- ],
- "version": "==2.4.2"
+ "sha256:05b3ded5e88747d28ee3ef493f2b92cbb947c1e45cf98cfef22e6d38bb67d4af",
+ "sha256:06826e7f72d1770e186e9c90e76b4f84d90cdb917b47ff88d8dc59a7b10e2b1e",
+ "sha256:08b753df3672b7066e74376f42ce8fc4683e4fd1358d34c80f502e939ee944d2",
+ "sha256:2cd29bd1911782baaee890544c653bb03ec7d95ebeb144d714b0f5c33deb55c7",
+ "sha256:31e5637e9036d966824edaa91bf0aa39dc6f525a1c599f39fd5c50340264e079",
+ "sha256:42fad67d7072216a49e34f923d8cbda9edacbf6633b19a79655e88a1b4857063",
+ "sha256:4946b67235b9d2ea7d31307be9d5ad5959d6c4a8f98f900157b47abddf698401",
+ "sha256:522fdb2809603ee97a4d0ef2f8d617bc791eb483313ba307cb9c0a773e5e5695",
+ "sha256:6f841c7272645dd7c65b07b7108adfa8af0aaea57f27b7f59e01d41f75444c85",
+ "sha256:7d335e35306af5b9bc0560ca39f740dfc8def72749645e193dd35be11fb323b3",
+ "sha256:8504661ffe324837f5c4607347eeee4cf0fcad689163c6e9c8d3b18cf1f4a4ad",
+ "sha256:9260b201ce584d7825d900c88700aa0bd6b40d4ebac7b213857bd2babee9dbca",
+ "sha256:9a30384cc402eac099210ab9b8801b2ae21e591831253883decdb4513b77a3cd",
+ "sha256:9e29af877c29338f0cab5f049ccc8bd3ead289a557f144376c4fbc7d1b98914f",
+ "sha256:ab50da871bc109b2d9389259aac269dd1b7c7413ee02d06fe4e486ed26882159",
+ "sha256:b13c80b877e73bcb6f012813c6f4a9334fcf4b0e96681c5a15dac578f2eedfa0",
+ "sha256:bfe66b577a7118e05b04141f0f1ed0959552d45672aa7ecb3d91e319d846001e",
+ "sha256:e091bd424567efa4b9d94287a952597c05d22155a13716bf5f9f746b9dc906d3",
+ "sha256:fa2b38c8519c5a3aa6e2b4e1cf1a549b54acda6adb25397ff542068e73d1ed00"
+ ],
+ "version": "==2.5"
},
"fatcat-client": {
"editable": true,
@@ -152,6 +152,14 @@
"index": "pypi",
"version": "==0.2"
},
+ "ftfy": {
+ "hashes": [
+ "sha256:84a1614190173bb447ac9d581e50185c6aa35b538754b6bedaba0cc0f83d8e80",
+ "sha256:fa74757fb7cb444366fa6a79c2feabd40281a44dfbf6eaed492a804764ee26b2"
+ ],
+ "index": "pypi",
+ "version": "==5.5.1"
+ },
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
@@ -366,6 +374,13 @@
],
"version": "==1.24.1"
},
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ },
"werkzeug": {
"hashes": [
"sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c",
@@ -558,10 +573,10 @@
},
"parso": {
"hashes": [
- "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2",
- "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24"
+ "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d",
+ "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31"
],
- "version": "==0.3.1"
+ "version": "==0.3.2"
},
"pathlib2": {
"hashes": [
@@ -595,10 +610,10 @@
},
"pluggy": {
"hashes": [
- "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095",
- "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f"
+ "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
+ "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
],
- "version": "==0.8.0"
+ "version": "==0.8.1"
},
"prompt-toolkit": {
"hashes": [
@@ -610,38 +625,38 @@
},
"psycopg2": {
"hashes": [
- "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e",
- "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237",
- "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f",
- "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300",
- "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076",
- "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221",
- "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82",
- "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92",
- "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3",
- "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6",
- "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e",
- "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae",
- "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535",
- "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f",
- "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26",
- "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7",
- "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53",
- "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b",
- "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc",
- "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee",
- "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20",
- "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d",
- "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9",
- "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244",
- "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f",
- "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b",
- "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370",
- "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9",
- "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d",
- "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0"
- ],
- "version": "==2.7.6.1"
+ "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94",
+ "sha256:0e9873e60f98f0c52339abf8f0339d1e22bfe5aae0bcf7aabd40c055175035ec",
+ "sha256:1148a5eb29073280bf9057c7fc45468592c1bb75a28f6df1591adb93c8cb63d0",
+ "sha256:259a8324e109d4922b0fcd046e223e289830e2568d6f4132a3702439e5fd532b",
+ "sha256:28dffa9ed4595429e61bacac41d3f9671bb613d1442ff43bcbec63d4f73ed5e8",
+ "sha256:314a74302d4737a3865d40ea50e430ce1543c921ba10f39d562e807cfe2edf2a",
+ "sha256:36b60201b6d215d7658a71493fdf6bd5e60ad9a0cffed39906627ff9f4f3afd3",
+ "sha256:3f9d532bce54c4234161176ff3b8688ff337575ca441ea27597e112dfcd0ee0c",
+ "sha256:5d222983847b40af989ad96c07fc3f07e47925e463baa5de716be8f805b41d9b",
+ "sha256:6757a6d2fc58f7d8f5d471ad180a0bd7b4dd3c7d681f051504fbea7ae29c8d6f",
+ "sha256:6a0e0f1e74edb0ab57d89680e59e7bfefad2bfbdf7c80eb38304d897d43674bb",
+ "sha256:6ca703ccdf734e886a1cf53eb702261110f6a8b0ed74bcad15f1399f74d3f189",
+ "sha256:8513b953d8f443c446aa79a4cc8a898bd415fc5e29349054f03a7d696d495542",
+ "sha256:9262a5ce2038570cb81b4d6413720484cb1bc52c064b2f36228d735b1f98b794",
+ "sha256:97441f851d862a0c844d981cbee7ee62566c322ebb3d68f86d66aa99d483985b",
+ "sha256:a07feade155eb8e69b54dd6774cf6acf2d936660c61d8123b8b6b1f9247b67d6",
+ "sha256:a9b9c02c91b1e3ec1f1886b2d0a90a0ea07cc529cb7e6e472b556bc20ce658f3",
+ "sha256:ae88216f94728d691b945983140bf40d51a1ff6c7fe57def93949bf9339ed54a",
+ "sha256:b360ffd17659491f1a6ad7c928350e229c7b7bd83a2b922b6ee541245c7a776f",
+ "sha256:b4221957ceccf14b2abdabef42d806e791350be10e21b260d7c9ce49012cc19e",
+ "sha256:b90758e49d5e6b152a460d10b92f8a6ccf318fcc0ee814dcf53f3a6fc5328789",
+ "sha256:c669ea986190ed05fb289d0c100cc88064351f2b85177cbfd3564c4f4847d18c",
+ "sha256:d1b61999d15c79cf7f4f7cc9021477aef35277fc52452cf50fd13b713c84424d",
+ "sha256:de7bb043d1adaaf46e38d47e7a5f703bb3dab01376111e522b07d25e1a79c1e1",
+ "sha256:e393568e288d884b94d263f2669215197840d097c7e5b0acd1a51c1ea7d1aba8",
+ "sha256:ed7e0849337bd37d89f2c2b0216a0de863399ee5d363d31b1e5330a99044737b",
+ "sha256:f153f71c3164665d269a5d03c7fa76ba675c7a8de9dc09a4e2c2cdc9936a7b41",
+ "sha256:f1fb5a8427af099beb7f65093cbdb52e021b8e6dbdfaf020402a623f4181baf5",
+ "sha256:f36b333e9f86a2fba960c72b90c34be6ca71819e300f7b1fc3d2b0f0b2c546cd",
+ "sha256:f4526d078aedd5187d0508aa5f9a01eae6a48a470ed678406da94b4cd6524b7e"
+ ],
+ "version": "==2.7.7"
},
"ptyprocess": {
"hashes": [
@@ -674,11 +689,11 @@
},
"pytest": {
"hashes": [
- "sha256:3e65a22eb0d4f1bdbc1eacccf4a3198bf8d4049dea5112d70a0c61b00e748d02",
- "sha256:5924060b374f62608a078494b909d341720a050b5224ff87e17e12377486a71d"
+ "sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2",
+ "sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be"
],
"index": "pypi",
- "version": "==4.1.0"
+ "version": "==4.1.1"
},
"pytest-cov": {
"hashes": [
@@ -727,30 +742,30 @@
},
"typed-ast": {
"hashes": [
- "sha256:0555eca1671ebe09eb5f2176723826f6f44cca5060502fea259de9b0e893ab53",
- "sha256:0ca96128ea66163aea13911c9b4b661cb345eb729a20be15c034271360fc7474",
- "sha256:16ccd06d614cf81b96de42a37679af12526ea25a208bce3da2d9226f44563868",
- "sha256:1e21ae7b49a3f744958ffad1737dfbdb43e1137503ccc59f4e32c4ac33b0bd1c",
- "sha256:37670c6fd857b5eb68aa5d193e14098354783b5138de482afa401cc2644f5a7f",
- "sha256:46d84c8e3806619ece595aaf4f37743083f9454c9ea68a517f1daa05126daf1d",
- "sha256:5b972bbb3819ece283a67358103cc6671da3646397b06e7acea558444daf54b2",
- "sha256:6306ffa64922a7b58ee2e8d6f207813460ca5a90213b4a400c2e730375049246",
- "sha256:6cb25dc95078931ecbd6cbcc4178d1b8ae8f2b513ae9c3bd0b7f81c2191db4c6",
- "sha256:7e19d439fee23620dea6468d85bfe529b873dace39b7e5b0c82c7099681f8a22",
- "sha256:7f5cd83af6b3ca9757e1127d852f497d11c7b09b4716c355acfbebf783d028da",
- "sha256:81e885a713e06faeef37223a5b1167615db87f947ecc73f815b9d1bbd6b585be",
- "sha256:94af325c9fe354019a29f9016277c547ad5d8a2d98a02806f27a7436b2da6735",
- "sha256:b1e5445c6075f509d5764b84ce641a1535748801253b97f3b7ea9d948a22853a",
- "sha256:cb061a959fec9a514d243831c514b51ccb940b58a5ce572a4e209810f2507dcf",
- "sha256:cc8d0b703d573cbabe0d51c9d68ab68df42a81409e4ed6af45a04a95484b96a5",
- "sha256:da0afa955865920edb146926455ec49da20965389982f91e926389666f5cf86a",
- "sha256:dc76738331d61818ce0b90647aedde17bbba3d3f9e969d83c1d9087b4f978862",
- "sha256:e7ec9a1445d27dbd0446568035f7106fa899a36f55e52ade28020f7b3845180d",
- "sha256:f741ba03feb480061ab91a465d1a3ed2d40b52822ada5b4017770dfcb88f839f",
- "sha256:fe800a58547dd424cd286b7270b967b5b3316b993d86453ede184a17b5a6b17d"
+ "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe",
+ "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c",
+ "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2",
+ "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a",
+ "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7",
+ "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827",
+ "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33",
+ "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9",
+ "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032",
+ "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9",
+ "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2",
+ "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2",
+ "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062",
+ "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15",
+ "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357",
+ "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a",
+ "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824",
+ "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442",
+ "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1",
+ "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2",
+ "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6"
],
"markers": "python_version < '3.7' and implementation_name == 'cpython'",
- "version": "==1.1.1"
+ "version": "==1.2.0"
},
"urllib3": {
"hashes": [
@@ -768,9 +783,9 @@
},
"wrapt": {
"hashes": [
- "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
+ "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533"
],
- "version": "==1.10.11"
+ "version": "==1.11.1"
}
}
}
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index b1fd7e68..2493b1ab 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -1,4 +1,5 @@
+
import collections
from fatcat_client import ReleaseEntity, ApiClient
@@ -40,17 +41,29 @@ def release_to_elasticsearch(release):
state = release.state,
revision = release.revision,
title = release.title,
+ original_title = release.original_title,
release_type = release.release_type,
release_status = release.release_status,
language = release.language,
+ license = release.license_slug,
doi = release.doi,
pmid = release.pmid,
pmcid = release.pmcid,
isbn13 = release.isbn13,
+ wikidata_qid = release.wikidata_qid,
core_id = release.core_id,
- wikidata_qid = release.wikidata_qid
+ arxiv_id = release.core_id,
+ jstor_id = release.jstor_id,
)
+ is_oa = None
+ is_longtail_oa = None
+ in_kbart = None
+ in_web = False
+ in_dweb = False
+ in_ia = False
+ in_shadow = False
+
if release.release_date:
# .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
t['release_date'] = release.release_date.isoformat()
@@ -59,52 +72,99 @@ def release_to_elasticsearch(release):
if release.release_year is not None:
t['release_year'] = release.release_year
+ t['any_abstract'] = len(release.abstracts) > 0
+ t['ref_count'] = len(release.refs or [])
+ t['contrib_count'] = len(release.contribs or [])
+ contrib_names = []
+ for c in (release.contribs or []):
+ if c.raw_name:
+ contrib_names.append(c.raw_name)
+ t['contrib_names'] = contrib_names
+
container = release.container
- container_is_kept = False
if container:
t['publisher'] = container.publisher
t['container_name'] = container.name
t['container_issnl'] = container.issnl
- container_extra = container.extra
- if container_extra:
- t['container_is_oa'] = container_extra.get('is_oa')
- container_is_kept = container_extra.get('is_kept', False)
- t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+ t['container_type'] = container.container_type
+ if container.extra:
+ if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+ is_oa = True
+ if container.extra.get('in_kbart'):
+ # TODO: better KBART check goes here
+ in_kbart = True
+ if container.extra.get('ia'):
+ # TODO: container longtail check goes here
+ # TODO: sim/microfilm check goes here
+ pass
+ # TODO: SHERPA/Romeo goes here
else:
t['publisher'] = release.publisher
files = release.files or []
t['file_count'] = len(files)
- in_wa = False
- in_ia = False
- t['file_pdf_url'] = None
+ t['fileset_count'] = len(release.filesets or [])
+ t['webcapture_count'] = len(release.webcaptures or [])
+ any_pdf_url = None
+ good_pdf_url = None
+ best_pdf_url = None
+ ia_pdf_url = None
for f in files:
+ if f.extra and f.extra.get('shadows'):
+ # TODO: shadow check goes here
+ in_shadows = True
is_pdf = 'pdf' in (f.mimetype or '')
for url in (f.urls or []):
- if url.rel == 'webarchive':
- in_wa = True
- if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''):
+ if url.url.lower().startswith('http'):
+ in_web = True
+ if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+ # TODO: not sure what rel will be
+ in_dweb = True
+ if is_pdf:
+ any_pdf_url = url.url
+ if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf:
+ is_preserved = True
+ good_pdf_url = url.url
+ if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
in_ia = True
if is_pdf:
- t['file_pdf_url'] = url.url
- if not t['file_pdf_url'] and is_pdf:
- t['file_pdf_url'] = url.url
- t['file_in_webarchive'] = in_wa
- t['file_in_ia'] = in_ia
+ best_pdf_url = url.url
+ ia_pdf_url = url.url
+ # here is where we bake-in priority; IA-specific
+ t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+ t['ia_pdf_url'] = ia_pdf_url
+
+ if release.license_slug:
+ # TODO: more/better checks here, particularly strict *not* OA licenses
+ if release.license_slug.startswith("CC-"):
+ is_oa = True
extra = release.extra or dict()
if extra:
- t['in_shadow'] = extra.get('in_shadow')
- if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
- t['container_is_longtail_oa'] = True
- t['any_abstract'] = len(release.abstracts) > 0
- t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+ # TODO: longtail OA check from GROBID here
+ if extra.get('in_kbart'):
+ # NOTE: not actually setting this anywhere
+ in_kbart = True
+ if extra.get('is_oa'):
+ # NOTE: not actually setting this anywhere
+ is_oa = True
+ if extra.get('grobid'):
+ if not t.get('container_name'):
+ t['container_name'] = extra['grobid'].get('container_name')
+ if extra['grobid'].get('longtail_oa'):
+ is_longtail_oa = True
+ if extra.get('crossref'):
+ if extra['crossref'].get('archive'):
+ # all crossref archives are KBART, I believe
+ in_kbart = True
- t['ref_count'] = len(release.refs or [])
- t['contrib_count'] = len(release.contribs or [])
- contrib_names = []
- for c in (release.contribs or []):
- if c.raw_name:
- contrib_names.append(c.raw_name)
- t['contrib_names'] = contrib_names
+ if is_longtail_oa:
+ is_oa = True
+ t['is_oa'] = is_oa
+ t['is_longtail_oa'] = is_longtail_oa
+ t['in_kbart'] = in_kbart
+ t['in_web'] = in_web
+ t['in_dweb'] = in_dweb
+ t['in_ia'] = in_ia
+ t['is_preserved'] = in_ia or in_kbart
return t
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index a7826a73..193f78f6 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -78,7 +78,7 @@ def test_crossref_dict_parse(crossref_importer):
assert r.contribs[0].raw_name == "Marcelo D. Radicioni"
assert r.contribs[0].index == 0
assert r.contribs[1].raw_affiliation == "Some University"
- assert r.contribs[1].extra['affiliations'] == ["Some Department"]
+ assert r.contribs[1].extra['more_affiliations'] == ["Some Department"]
assert r.contribs[1].role == "author"
assert r.contribs[3].role == "editor"
assert r.contribs[3].index is None
diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py
index feb604ce..4fed4aaa 100644
--- a/python/tests/import_grobid_metadata.py
+++ b/python/tests/import_grobid_metadata.py
@@ -26,7 +26,8 @@ def test_grobid_metadata_parse(grobid_metadata_importer):
print(re.contribs)
assert re.contribs[0].raw_name == "Wahyu Ary"
assert re.publisher == None
- assert re.extra.get('container_name') == None
+ if re.extra:
+ assert re.extra.get('container_name') == None
assert len(re.refs) == 27
def test_file_metadata_parse(grobid_metadata_importer):