diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 01:55:30 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 23:03:08 -0800 | 
| commit | 58ff361eb481bee9d2ef7249f48f94729d2a830d (patch) | |
| tree | 9cbcbcf1c7eafa958c81d18fcf5b6a5d27f2f42d /python/tests | |
| parent | b2f53077bd05a536b5fdb551755a559b653421d3 (diff) | |
| download | fatcat-58ff361eb481bee9d2ef7249f48f94729d2a830d.tar.gz fatcat-58ff361eb481bee9d2ef7249f48f94729d2a830d.zip | |
very simple dblp container importer
Diffstat (limited to 'python/tests')
| -rw-r--r-- | python/tests/files/ISSN-to-ISSN-L.snip.txt | 1 | ||||
| -rw-r--r-- | python/tests/files/dblp_container_map.tsv | 10 | ||||
| -rw-r--r-- | python/tests/files/example_dblp_containers.json | 10 | ||||
| -rw-r--r-- | python/tests/import_dblp.py | 61 | 
4 files changed, 77 insertions, 5 deletions
| diff --git a/python/tests/files/ISSN-to-ISSN-L.snip.txt b/python/tests/files/ISSN-to-ISSN-L.snip.txt index c97de279..2569c443 100644 --- a/python/tests/files/ISSN-to-ISSN-L.snip.txt +++ b/python/tests/files/ISSN-to-ISSN-L.snip.txt @@ -8,3 +8,4 @@ ISSN	ISSN-L  0000-0094	0002-0094  0000-0108	0002-0108  0000-0140	0002-0140 +1877-3273	1877-3273 diff --git a/python/tests/files/dblp_container_map.tsv b/python/tests/files/dblp_container_map.tsv new file mode 100644 index 00000000..5f3dbacd --- /dev/null +++ b/python/tests/files/dblp_container_map.tsv @@ -0,0 +1,10 @@ +dblp_prefix	container_id +series/atlantis	7xuzjpmxqnaq7enf3qc45diflu +series/apc	ktlimnolmvbwbhthlj66zuooea +series/swb	s2qx5lvrdngvrk6y2arrem3qom +series/sbece	7g52v6peqrek5l7zmstisnpkdm +series/wssris	iszwbitlzfau5k77dpqifz5tf4 +series/lnm	r436k67l3fdm5cgs5hqif5e7hi +conf/focs	aaaaaaaaaaaaaeiraaaaaaaaai +conf/er	aaaaaaaaaaaaaeiraaaaaaaaai +journals/cacm	aaaaaaaaaaaaaeiraaaaaaaaai diff --git a/python/tests/files/example_dblp_containers.json b/python/tests/files/example_dblp_containers.json new file mode 100644 index 00000000..c5137604 --- /dev/null +++ b/python/tests/files/example_dblp_containers.json @@ -0,0 +1,10 @@ +{"homepage_url": "http://link.springer.com/bookseries/10077", "issns": ["1877-3273"], "key": "series/atlantis", "title": "Atlantis Thinking Machines"} +{"homepage_url": "http://ebooks.iospress.nl/bookseries/advances-in-parallel-computing", "issns": ["0927-5452", "1879-808X"], "key": "series/apc", "title": "Advances in Parallel Computing"} +{"homepage_url": "http://link.springer.com/bookseries/7056", "issns": ["1559-7474"], "key": "series/swb", "title": "Semantic Web and Beyond: Computing for Human Experience"} +{"homepage_url": "http://link.springer.com/bookseries/10059", "issns": ["2191-8112"], "key": "series/sbece", "title": "Springer Briefs in Electrical and Computer Engineering"} +{"homepage_url": "http://www.worldscientific.com/series/wssris", "issns": [], "key": "series/wssris", "title": "World Scientific Series in Robotics and Intelligent Systems"} +{"acronym": "LNM", "homepage_url": "http://link.springer.com/bookseries/304", "issns": ["0075-8434"], "key": "series/lnm", "title": "Lecture Notes in Mathematics"} +{"homepage_url": "https://www.usenix.org/lisa", "issns": [], "key": "series/sage", "title": "Short Topics in System Administration"} +{"homepage_url": "http://link.springer.com/bookseries/4205", "issns": ["2191-6586", "2191-6594"], "key": "series/acvpr", "title": "Advances in Computer Vision and Pattern Recognition"} +{"acronym": "STAR", "homepage_url": "http://link.springer.com/bookseries/5208", "issns": ["1610-7438", "1610-742X"], "key": "series/star", "title": "Springer Tracts in Advanced Robotics"} +{"acronym": "FAIA", "homepage_url": "http://ebooks.iospress.nl/bookseries/frontiers-in-artificial-intelligence-and-applications", "issns": ["0922-6389", "1879-8314"], "key": "series/faia", "title": "Frontiers in Artificial Intelligence and Applications"} diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py index fd40eb06..5041b509 100644 --- a/python/tests/import_dblp.py +++ b/python/tests/import_dblp.py @@ -1,8 +1,9 @@ +import io  import pytest  from bs4 import BeautifulSoup -from fatcat_tools.importers import DblpReleaseImporter, Bs4XmlLargeFilePusher +from fatcat_tools.importers import DblpReleaseImporter, DblpContainerImporter, Bs4XmlLargeFilePusher, JsonLinePusher  from fixtures import * @@ -13,15 +14,21 @@ def dblp_importer(api):  @pytest.fixture(scope="function")  def dblp_importer_existing(api): -    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: -        yield DblpReleaseImporter(api, issn_file, bezerk_mode=False, lookup_refs=True) +    with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file: +        yield DblpReleaseImporter(api, tsv_file, bezerk_mode=False) + +@pytest.fixture(scope="function") +def dblp_container_importer(api): +    with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file: +        with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: +            yield DblpContainerImporter(api, issn_file, tsv_file, io.StringIO(), bezerk_mode=True)  def test_dblp_importer(dblp_importer):      last_index = dblp_importer.api.get_changelog(limit=1)[0].index      with open('tests/files/example_dblp.xml', 'rb') as f:          dblp_importer.bezerk_mode = True          counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run() -    print(counts) +    #print(counts)      assert counts['insert'] == 3      assert counts['exists'] == 0      assert counts['skip'] == 1 @@ -44,12 +51,56 @@ def test_dblp_importer(dblp_importer):          dblp_importer.bezerk_mode = False          dblp_importer.reset()          counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run() -    print(counts) +    #print(counts)      assert counts['insert'] == 0      assert counts['exists'] == 3      assert counts['skip'] == 1      assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index +def test_dblp_container_importer(dblp_container_importer): +    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index +    output_tsv_map = io.StringIO() +    with open('tests/files/example_dblp_containers.json', 'r') as f: +        dblp_container_importer.bezerk_mode = True +        dblp_container_importer.dblp_container_map_output = output_tsv_map +        counts = JsonLinePusher(dblp_container_importer, f).run() +    assert counts['insert'] == 10 +    assert counts['exists'] == 0 +    assert counts['skip'] == 0 + +    # fetch most recent editgroup +    change = dblp_container_importer.api.get_changelog_entry(index=last_index+1) +    eg = change.editgroup +    assert eg.description +    assert "dblp" in eg.description.lower() +    assert eg.extra['git_rev'] +    assert "fatcat_tools.DblpContainerImporter" in eg.extra['agent'] + +    # check that entity name mangling was fixed on import +    eg = dblp_container_importer.api.get_editgroup(eg.editgroup_id) +    container = dblp_container_importer.api.get_container(eg.edits.containers[0].ident) +    assert container.name == "Atlantis Thinking Machines" +    assert container.issnl == "1877-3273" +    assert container.container_type == "book-series" +    assert container.extra['dblp']['prefix'] == "series/atlantis" +    assert container.extra['urls'] == ["http://link.springer.com/bookseries/10077"] + +    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index +    output_tsv_map.seek(0) +    print(output_tsv_map.read()) +    output_tsv_map.seek(0) +    with open('tests/files/example_dblp_containers.json', 'r') as f: +        dblp_container_importer.reset() +        dblp_container_importer.bezerk_mode = False +        dblp_container_importer.dblp_container_map_output = io.StringIO() +        dblp_container_importer.read_dblp_container_map_file(output_tsv_map) +        counts = JsonLinePusher(dblp_container_importer, f).run() +    print(counts) +    assert counts['insert'] == 0 +    assert counts['exists'] == 10 +    assert counts['skip'] == 0 +    assert last_index == dblp_container_importer.api.get_changelog(limit=1)[0].index +  def test_dblp_xml_parse(dblp_importer):      with open('tests/files/example_dblp_article.xml', 'r') as f:          soup = BeautifulSoup(f, "xml") | 
