diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/Pipfile.lock | 189 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms.py | 120 | ||||
| -rw-r--r-- | python/tests/import_crossref.py | 2 | ||||
| -rw-r--r-- | python/tests/import_grobid_metadata.py | 3 | 
4 files changed, 195 insertions, 119 deletions
| diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 296079f0..f2d39a99 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@  {      "_meta": {          "hash": { -            "sha256": "c3deb49cf4c122c2aed3f4f944e9763cfcf40c85891ca3d3e9cabc3debbb9075" +            "sha256": "8f98bb3f6a3083c8b03cb68d1ee48b25449a950dd8a9d15189f2eb4fae48f760"          },          "pipfile-spec": 6,          "requires": { @@ -96,27 +96,27 @@          },          "cryptography": {              "hashes": [ -                "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd", -                "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038", -                "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378", -                "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3", -                "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13", -                "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc", -                "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f", -                "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427", -                "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515", -                "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b", -                "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf", -                "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681", -                "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b", -                "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5", -                "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b", -                "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6", -                "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70", -                "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e", -                "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859" -            ], -            "version": "==2.4.2" +                "sha256:05b3ded5e88747d28ee3ef493f2b92cbb947c1e45cf98cfef22e6d38bb67d4af", +                "sha256:06826e7f72d1770e186e9c90e76b4f84d90cdb917b47ff88d8dc59a7b10e2b1e", +                "sha256:08b753df3672b7066e74376f42ce8fc4683e4fd1358d34c80f502e939ee944d2", +                "sha256:2cd29bd1911782baaee890544c653bb03ec7d95ebeb144d714b0f5c33deb55c7", +                "sha256:31e5637e9036d966824edaa91bf0aa39dc6f525a1c599f39fd5c50340264e079", +                "sha256:42fad67d7072216a49e34f923d8cbda9edacbf6633b19a79655e88a1b4857063", +                "sha256:4946b67235b9d2ea7d31307be9d5ad5959d6c4a8f98f900157b47abddf698401", +                "sha256:522fdb2809603ee97a4d0ef2f8d617bc791eb483313ba307cb9c0a773e5e5695", +                "sha256:6f841c7272645dd7c65b07b7108adfa8af0aaea57f27b7f59e01d41f75444c85", +                "sha256:7d335e35306af5b9bc0560ca39f740dfc8def72749645e193dd35be11fb323b3", +                "sha256:8504661ffe324837f5c4607347eeee4cf0fcad689163c6e9c8d3b18cf1f4a4ad", +                "sha256:9260b201ce584d7825d900c88700aa0bd6b40d4ebac7b213857bd2babee9dbca", +                "sha256:9a30384cc402eac099210ab9b8801b2ae21e591831253883decdb4513b77a3cd", +                "sha256:9e29af877c29338f0cab5f049ccc8bd3ead289a557f144376c4fbc7d1b98914f", +                "sha256:ab50da871bc109b2d9389259aac269dd1b7c7413ee02d06fe4e486ed26882159", +                "sha256:b13c80b877e73bcb6f012813c6f4a9334fcf4b0e96681c5a15dac578f2eedfa0", +                "sha256:bfe66b577a7118e05b04141f0f1ed0959552d45672aa7ecb3d91e319d846001e", +                "sha256:e091bd424567efa4b9d94287a952597c05d22155a13716bf5f9f746b9dc906d3", +                "sha256:fa2b38c8519c5a3aa6e2b4e1cf1a549b54acda6adb25397ff542068e73d1ed00" +            ], +            "version": "==2.5"          },          "fatcat-client": {              "editable": true, @@ -152,6 +152,14 @@              "index": "pypi",              "version": "==0.2"          }, +        "ftfy": { +            "hashes": [ +                "sha256:84a1614190173bb447ac9d581e50185c6aa35b538754b6bedaba0cc0f83d8e80", +                "sha256:fa74757fb7cb444366fa6a79c2feabd40281a44dfbf6eaed492a804764ee26b2" +            ], +            "index": "pypi", +            "version": "==5.5.1" +        },          "idna": {              "hashes": [                  "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -366,6 +374,13 @@              ],              "version": "==1.24.1"          }, +        "wcwidth": { +            "hashes": [ +                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", +                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" +            ], +            "version": "==0.1.7" +        },          "werkzeug": {              "hashes": [                  "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", @@ -558,10 +573,10 @@          },          "parso": {              "hashes": [ -                "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", -                "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" +                "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d", +                "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31"              ], -            "version": "==0.3.1" +            "version": "==0.3.2"          },          "pathlib2": {              "hashes": [ @@ -595,10 +610,10 @@          },          "pluggy": {              "hashes": [ -                "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095", -                "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f" +                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", +                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"              ], -            "version": "==0.8.0" +            "version": "==0.8.1"          },          "prompt-toolkit": {              "hashes": [ @@ -610,38 +625,38 @@          },          "psycopg2": {              "hashes": [ -                "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e", -                "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237", -                "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f", -                "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300", -                "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076", -                "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221", -                "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82", -                "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92", -                "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3", -                "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6", -                "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e", -                "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae", -                "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535", -                "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f", -                "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26", -                "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7", -                "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53", -                "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b", -                "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc", -                "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee", -                "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20", -                "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d", -                "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9", -                "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244", -                "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f", -                "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b", -                "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370", -                "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9", -                "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d", -                "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0" -            ], -            "version": "==2.7.6.1" +                "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94", +                "sha256:0e9873e60f98f0c52339abf8f0339d1e22bfe5aae0bcf7aabd40c055175035ec", +                "sha256:1148a5eb29073280bf9057c7fc45468592c1bb75a28f6df1591adb93c8cb63d0", +                "sha256:259a8324e109d4922b0fcd046e223e289830e2568d6f4132a3702439e5fd532b", +                "sha256:28dffa9ed4595429e61bacac41d3f9671bb613d1442ff43bcbec63d4f73ed5e8", +                "sha256:314a74302d4737a3865d40ea50e430ce1543c921ba10f39d562e807cfe2edf2a", +                "sha256:36b60201b6d215d7658a71493fdf6bd5e60ad9a0cffed39906627ff9f4f3afd3", +                "sha256:3f9d532bce54c4234161176ff3b8688ff337575ca441ea27597e112dfcd0ee0c", +                "sha256:5d222983847b40af989ad96c07fc3f07e47925e463baa5de716be8f805b41d9b", +                "sha256:6757a6d2fc58f7d8f5d471ad180a0bd7b4dd3c7d681f051504fbea7ae29c8d6f", +                "sha256:6a0e0f1e74edb0ab57d89680e59e7bfefad2bfbdf7c80eb38304d897d43674bb", +                "sha256:6ca703ccdf734e886a1cf53eb702261110f6a8b0ed74bcad15f1399f74d3f189", +                "sha256:8513b953d8f443c446aa79a4cc8a898bd415fc5e29349054f03a7d696d495542", +                "sha256:9262a5ce2038570cb81b4d6413720484cb1bc52c064b2f36228d735b1f98b794", +                "sha256:97441f851d862a0c844d981cbee7ee62566c322ebb3d68f86d66aa99d483985b", +                "sha256:a07feade155eb8e69b54dd6774cf6acf2d936660c61d8123b8b6b1f9247b67d6", +                "sha256:a9b9c02c91b1e3ec1f1886b2d0a90a0ea07cc529cb7e6e472b556bc20ce658f3", +                "sha256:ae88216f94728d691b945983140bf40d51a1ff6c7fe57def93949bf9339ed54a", +                "sha256:b360ffd17659491f1a6ad7c928350e229c7b7bd83a2b922b6ee541245c7a776f", +                "sha256:b4221957ceccf14b2abdabef42d806e791350be10e21b260d7c9ce49012cc19e", +                "sha256:b90758e49d5e6b152a460d10b92f8a6ccf318fcc0ee814dcf53f3a6fc5328789", +                "sha256:c669ea986190ed05fb289d0c100cc88064351f2b85177cbfd3564c4f4847d18c", +                "sha256:d1b61999d15c79cf7f4f7cc9021477aef35277fc52452cf50fd13b713c84424d", +                "sha256:de7bb043d1adaaf46e38d47e7a5f703bb3dab01376111e522b07d25e1a79c1e1", +                "sha256:e393568e288d884b94d263f2669215197840d097c7e5b0acd1a51c1ea7d1aba8", +                "sha256:ed7e0849337bd37d89f2c2b0216a0de863399ee5d363d31b1e5330a99044737b", +                "sha256:f153f71c3164665d269a5d03c7fa76ba675c7a8de9dc09a4e2c2cdc9936a7b41", +                "sha256:f1fb5a8427af099beb7f65093cbdb52e021b8e6dbdfaf020402a623f4181baf5", +                "sha256:f36b333e9f86a2fba960c72b90c34be6ca71819e300f7b1fc3d2b0f0b2c546cd", +                "sha256:f4526d078aedd5187d0508aa5f9a01eae6a48a470ed678406da94b4cd6524b7e" +            ], +            "version": "==2.7.7"          },          "ptyprocess": {              "hashes": [ @@ -674,11 +689,11 @@          },          "pytest": {              "hashes": [ -                "sha256:3e65a22eb0d4f1bdbc1eacccf4a3198bf8d4049dea5112d70a0c61b00e748d02", -                "sha256:5924060b374f62608a078494b909d341720a050b5224ff87e17e12377486a71d" +                "sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2", +                "sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be"              ],              "index": "pypi", -            "version": "==4.1.0" +            "version": "==4.1.1"          },          "pytest-cov": {              "hashes": [ @@ -727,30 +742,30 @@          },          "typed-ast": {              "hashes": [ -                "sha256:0555eca1671ebe09eb5f2176723826f6f44cca5060502fea259de9b0e893ab53", -                "sha256:0ca96128ea66163aea13911c9b4b661cb345eb729a20be15c034271360fc7474", -                "sha256:16ccd06d614cf81b96de42a37679af12526ea25a208bce3da2d9226f44563868", -                "sha256:1e21ae7b49a3f744958ffad1737dfbdb43e1137503ccc59f4e32c4ac33b0bd1c", -                "sha256:37670c6fd857b5eb68aa5d193e14098354783b5138de482afa401cc2644f5a7f", -                "sha256:46d84c8e3806619ece595aaf4f37743083f9454c9ea68a517f1daa05126daf1d", -                "sha256:5b972bbb3819ece283a67358103cc6671da3646397b06e7acea558444daf54b2", -                "sha256:6306ffa64922a7b58ee2e8d6f207813460ca5a90213b4a400c2e730375049246", -                "sha256:6cb25dc95078931ecbd6cbcc4178d1b8ae8f2b513ae9c3bd0b7f81c2191db4c6", -                "sha256:7e19d439fee23620dea6468d85bfe529b873dace39b7e5b0c82c7099681f8a22", -                "sha256:7f5cd83af6b3ca9757e1127d852f497d11c7b09b4716c355acfbebf783d028da", -                "sha256:81e885a713e06faeef37223a5b1167615db87f947ecc73f815b9d1bbd6b585be", -                "sha256:94af325c9fe354019a29f9016277c547ad5d8a2d98a02806f27a7436b2da6735", -                "sha256:b1e5445c6075f509d5764b84ce641a1535748801253b97f3b7ea9d948a22853a", -                "sha256:cb061a959fec9a514d243831c514b51ccb940b58a5ce572a4e209810f2507dcf", -                "sha256:cc8d0b703d573cbabe0d51c9d68ab68df42a81409e4ed6af45a04a95484b96a5", -                "sha256:da0afa955865920edb146926455ec49da20965389982f91e926389666f5cf86a", -                "sha256:dc76738331d61818ce0b90647aedde17bbba3d3f9e969d83c1d9087b4f978862", -                "sha256:e7ec9a1445d27dbd0446568035f7106fa899a36f55e52ade28020f7b3845180d", -                "sha256:f741ba03feb480061ab91a465d1a3ed2d40b52822ada5b4017770dfcb88f839f", -                "sha256:fe800a58547dd424cd286b7270b967b5b3316b993d86453ede184a17b5a6b17d" +                "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe", +                "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c", +                "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2", +                "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a", +                "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7", +                "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827", +                "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33", +                "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9", +                "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032", +                "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9", +                "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2", +                "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2", +                "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062", +                "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15", +                "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357", +                "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a", +                "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824", +                "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442", +                "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1", +                "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2", +                "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6"              ],              "markers": "python_version < '3.7' and implementation_name == 'cpython'", -            "version": "==1.1.1" +            "version": "==1.2.0"          },          "urllib3": {              "hashes": [ @@ -768,9 +783,9 @@          },          "wrapt": {              "hashes": [ -                "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" +                "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533"              ], -            "version": "==1.10.11" +            "version": "==1.11.1"          }      }  } diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index b1fd7e68..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ +  import collections  from fatcat_client import ReleaseEntity, ApiClient @@ -40,17 +41,29 @@ def release_to_elasticsearch(release):          state = release.state,          revision = release.revision,          title = release.title, +        original_title = release.original_title,          release_type = release.release_type,          release_status = release.release_status,          language = release.language, +        license = release.license_slug,          doi = release.doi,          pmid = release.pmid,          pmcid = release.pmcid,          isbn13 = release.isbn13, +        wikidata_qid = release.wikidata_qid,          core_id = release.core_id, -        wikidata_qid = release.wikidata_qid +        arxiv_id = release.core_id, +        jstor_id = release.jstor_id,      ) +    is_oa = None +    is_longtail_oa = None +    in_kbart = None +    in_web = False +    in_dweb = False +    in_ia = False +    in_shadow = False +      if release.release_date:          # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)          t['release_date'] = release.release_date.isoformat() @@ -59,52 +72,99 @@ def release_to_elasticsearch(release):      if release.release_year is not None:          t['release_year'] = release.release_year +    t['any_abstract'] = len(release.abstracts) > 0 +    t['ref_count'] = len(release.refs or []) +    t['contrib_count'] = len(release.contribs or []) +    contrib_names = [] +    for c in (release.contribs or []): +        if c.raw_name: +            contrib_names.append(c.raw_name) +    t['contrib_names'] = contrib_names +      container = release.container -    container_is_kept = False      if container:          t['publisher'] = container.publisher          t['container_name'] = container.name          t['container_issnl'] = container.issnl -        container_extra = container.extra -        if container_extra: -            t['container_is_oa'] = container_extra.get('is_oa') -            container_is_kept = container_extra.get('is_kept', False) -            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') +        t['container_type'] = container.container_type +        if container.extra: +            if container.extra.get('is_oa') or container.extra.get('in_doaj'): +                is_oa = True +            if container.extra.get('in_kbart'): +                # TODO: better KBART check goes here +                in_kbart = True +            if container.extra.get('ia'): +                # TODO: container longtail check goes here +                # TODO: sim/microfilm check goes here +                pass +            # TODO: SHERPA/Romeo goes here      else:          t['publisher'] = release.publisher      files = release.files or []      t['file_count'] = len(files) -    in_wa = False -    in_ia = False -    t['file_pdf_url'] = None +    t['fileset_count'] = len(release.filesets or []) +    t['webcapture_count'] = len(release.webcaptures or []) +    any_pdf_url = None +    good_pdf_url = None +    best_pdf_url = None +    ia_pdf_url = None      for f in files: +        if f.extra and f.extra.get('shadows'): +            # TODO: shadow check goes here +            in_shadows = True          is_pdf = 'pdf' in (f.mimetype or '')          for url in (f.urls or []): -            if url.rel == 'webarchive': -                in_wa = True -            if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): +            if url.url.lower().startswith('http'): +                in_web = True +            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): +                # TODO: not sure what rel will be +                in_dweb = True +            if is_pdf: +                any_pdf_url = url.url +            if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf: +                is_preserved = True +                good_pdf_url = url.url +            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:                  in_ia = True                  if is_pdf: -                    t['file_pdf_url'] = url.url -            if not t['file_pdf_url'] and is_pdf: -                t['file_pdf_url'] = url.url -    t['file_in_webarchive'] = in_wa -    t['file_in_ia'] = in_ia +                    best_pdf_url = url.url +                    ia_pdf_url = url.url +    # here is where we bake-in priority; IA-specific +    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url +    t['ia_pdf_url'] = ia_pdf_url + +    if release.license_slug: +        # TODO: more/better checks here, particularly strict *not* OA licenses +        if release.license_slug.startswith("CC-"): +            is_oa = True      extra = release.extra or dict()      if extra: -        t['in_shadow'] = extra.get('in_shadow') -        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): -            t['container_is_longtail_oa'] = True -    t['any_abstract'] = len(release.abstracts) > 0 -    t['is_kept'] = container_is_kept or extra.get('is_kept', False) +        # TODO: longtail OA check from GROBID here +        if extra.get('in_kbart'): +            # NOTE: not actually setting this anywhere +            in_kbart = True +        if extra.get('is_oa'): +            # NOTE: not actually setting this anywhere +            is_oa = True +        if extra.get('grobid'): +            if not t.get('container_name'): +                t['container_name'] = extra['grobid'].get('container_name') +            if extra['grobid'].get('longtail_oa'): +                is_longtail_oa = True +        if extra.get('crossref'): +            if extra['crossref'].get('archive'): +                # all crossref archives are KBART, I believe +                in_kbart = True -    t['ref_count'] = len(release.refs or []) -    t['contrib_count'] = len(release.contribs or []) -    contrib_names = [] -    for c in (release.contribs or []): -        if c.raw_name: -            contrib_names.append(c.raw_name) -    t['contrib_names'] = contrib_names +    if is_longtail_oa: +        is_oa = True +    t['is_oa'] = is_oa +    t['is_longtail_oa'] = is_longtail_oa +    t['in_kbart'] = in_kbart +    t['in_web'] = in_web +    t['in_dweb'] = in_dweb +    t['in_ia'] = in_ia +    t['is_preserved'] = in_ia or in_kbart      return t diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index a7826a73..193f78f6 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -78,7 +78,7 @@ def test_crossref_dict_parse(crossref_importer):          assert r.contribs[0].raw_name == "Marcelo D. Radicioni"          assert r.contribs[0].index == 0          assert r.contribs[1].raw_affiliation == "Some University" -        assert r.contribs[1].extra['affiliations'] == ["Some Department"] +        assert r.contribs[1].extra['more_affiliations'] == ["Some Department"]          assert r.contribs[1].role == "author"          assert r.contribs[3].role == "editor"          assert r.contribs[3].index is None diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index feb604ce..4fed4aaa 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -26,7 +26,8 @@ def test_grobid_metadata_parse(grobid_metadata_importer):          print(re.contribs)          assert re.contribs[0].raw_name == "Wahyu Ary"          assert re.publisher == None -        assert re.extra.get('container_name') == None +        if re.extra: +            assert re.extra.get('container_name') == None          assert len(re.refs) == 27  def test_file_metadata_parse(grobid_metadata_importer): | 
