diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-29 21:52:33 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-29 21:59:05 -0800 |
commit | 5d458a3df7e58e6551d8ec72979e376c62fdd2f7 (patch) | |
tree | 4ebbaa95151c94817cb13d4e7a8e770c804694c7 /python/tests/transform_elasticsearch.py | |
parent | e047fbe1a9c495e86a6757d44eb32c9109a1b753 (diff) | |
download | fatcat-5d458a3df7e58e6551d8ec72979e376c62fdd2f7.tar.gz fatcat-5d458a3df7e58e6551d8ec72979e376c62fdd2f7.zip |
fix some transform bugs, add some tests
Diffstat (limited to 'python/tests/transform_elasticsearch.py')
-rw-r--r-- | python/tests/transform_elasticsearch.py | 114 |
1 files changed, 114 insertions, 0 deletions
"""Tests for the release, container, and file Elasticsearch transforms."""
# Source: diff --git a/python/tests/transform_elasticsearch.py
#         b/python/tests/transform_elasticsearch.py
#         (new file mode 100644, index 00000000..ab613a0a)

import json
import pytest
from fatcat_tools import *
from fatcat_openapi_client import *
from fixtures import api
from import_journal_metadata import journal_metadata_importer

from import_crossref import crossref_importer
from import_matched import matched_importer


def test_basic_elasticsearch_convert(crossref_importer):
    """Smoke test: a parsed Crossref record converts without raising."""
    with open('tests/files/crossref-works.single.json', 'r') as f:
        # not a single line
        raw = json.loads(f.read())
    r = crossref_importer.parse_record(raw)
    r.state = 'active'
    # No assertions on the output; this only checks the call succeeds.
    release_to_elasticsearch(r)


def test_rich_elasticsearch_convert():
    """Convert a hand-built release with a container and a file attached.

    Checks the preservation/access flags and the reference counts in the
    resulting Elasticsearch document.
    """
    r = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            # One unlinked ref and one ref pointing at a target release.
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = 'active'
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {"color": "blue"},
            "doaj": {"as_of": "2010-02-03"},
        },
    )
    r.files = [FileEntity(
        mimetype="application/pdf",
        urls=[
            FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
            FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"),
            FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
        ],
        extra={
            "shadows": {},
        },
    )]
    es = release_to_elasticsearch(r)
    assert es['release_year'] == r.release_year
    assert es['in_ia'] == True
    # NOTE(review): presumably False because release_year 1234 falls outside
    # every listed year_spans range -- confirm against the transform.
    assert es['in_jstor'] == False
    assert es['in_ia_sim'] == False
    assert es['in_web'] == True
    assert es['in_dweb'] == True
    assert es['is_oa'] == True
    assert es['is_longtail_oa'] == False
    # Two refs were supplied; only one carries a target_release_id.
    assert es['ref_count'] == 2
    assert es['ref_linked_count'] == 1


def test_elasticsearch_release_from_json():
    """Convert a release loaded from a JSON fixture and check its fields."""
    # Use a context manager so the fixture file handle is always closed.
    with open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r') as f:
        r = entity_from_json(f.read(), ReleaseEntity)
    es = release_to_elasticsearch(r)

    assert es['subtitle'] == "Correpondence"
    assert es['ident'] == "etodop5banbndg3faecnfm6ozi"
    assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology"
    assert es['first_page'] == "1404"
    assert es['issue'] == "11"
    assert es['volume'] == "118"
    assert es['number'] is None
    assert es['in_ia_sim'] == True
    assert es['in_kbart'] == True


def test_elasticsearch_container_transform(journal_metadata_importer):
    """Convert a container parsed from the first journal-metadata sample row."""
    with open('tests/files/journal_metadata.sample.json', 'r') as f:
        # Only the first line of the sample file is parsed.
        raw = json.loads(f.readline())
    c = journal_metadata_importer.parse_record(raw)
    c.state = 'active'
    es = container_to_elasticsearch(c)
    assert es['publisher'] == c.publisher


def test_elasticsearch_file_transform(matched_importer):
    """Convert a file entity loaded from a JSON fixture and check its fields."""
    # NOTE(review): the matched_importer fixture is requested but never used
    # in this body; kept so the test's fixture setup stays unchanged.
    # Use a context manager so the fixture file handle is always closed.
    with open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r') as handle:
        f = entity_from_json(handle.read(), FileEntity)

    f.state = 'active'
    es = file_to_elasticsearch(f)
    assert es['sha1'] == f.sha1
    assert es['sha256'] == f.sha256
    assert es['md5'] == f.md5
    assert es['size_bytes'] == f.size
    assert es['mimetype'] == f.mimetype
    assert es['in_ia'] == True
    assert 'publisher' in es['rel']

    # XXX: implement hosts and domain parsing with urlcanon
    #assert 'journals.plos.org' in es['host']
    #assert 'plos.org' in es['domain']