about | summary | refs | log | tree | commit | diff | stats
path: root/python/tests/import_dblp.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-12-02 11:30:03 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-12-17 23:03:08 -0800
commit: c66f9b2d98de88a98d3a1737d415bdab4e89027c (patch)
tree: 7d89d9b91c1fad8d436c8145c934c60ca074b7b5 /python/tests/import_dblp.py
parent: a57de2331caa6f92242dae6197b44926801b429a (diff)
download: fatcat-c66f9b2d98de88a98d3a1737d415bdab4e89027c.tar.gz
download: fatcat-c66f9b2d98de88a98d3a1737d415bdab4e89027c.zip
basic test coverage of dblp release importer
Diffstat (limited to 'python/tests/import_dblp.py')
-rw-r--r-- python/tests/import_dblp.py 72
1 files changed, 72 insertions, 0 deletions
diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py
new file mode 100644
index 00000000..fd40eb06
--- /dev/null
+++ b/python/tests/import_dblp.py
@@ -0,0 +1,72 @@
+
+import pytest
+from bs4 import BeautifulSoup
+
+from fatcat_tools.importers import DblpReleaseImporter, Bs4XmlLargeFilePusher
+from fixtures import *
+
+
+@pytest.fixture(scope="function")
+def dblp_importer(api):  # per-test DblpReleaseImporter constructed with bezerk_mode=True
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:  # yield happens inside the `with`, so the file stays open while the test runs
+        yield DblpReleaseImporter(api, issn_file, bezerk_mode=True, lookup_refs=True)
+
+@pytest.fixture(scope="function")
+def dblp_importer_existing(api):  # same construction as dblp_importer, but with bezerk_mode=False
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:  # yield happens inside the `with`, so the file stays open while the test runs
+        yield DblpReleaseImporter(api, issn_file, bezerk_mode=False, lookup_refs=True)
+
+def test_dblp_importer(dblp_importer):  # end-to-end: push example_dblp.xml through the importer twice and compare the counts
+    last_index = dblp_importer.api.get_changelog(limit=1)[0].index  # newest changelog index before the first pass
+    with open('tests/files/example_dblp.xml', 'rb') as f:
+        dblp_importer.bezerk_mode = True
+        counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
+    print(counts)
+    assert counts['insert'] == 3
+    assert counts['exists'] == 0
+    assert counts['skip'] == 1
+
+    # fetch the changelog entry right after the pre-import index and check its editgroup metadata
+    change = dblp_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "dblp" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.DblpReleaseImporter" in eg.extra['agent']
+
+    # check that entity name mangling was fixed on import (the contributor name must keep its "ö")
+    eg = dblp_importer.api.get_editgroup(eg.editgroup_id)
+    release = dblp_importer.api.get_release(eg.edits.releases[0].ident)
+    assert release.contribs[1].raw_name == "Michael H. Böhlen"
+
+    last_index = dblp_importer.api.get_changelog(limit=1)[0].index  # re-read the newest index before the second pass
+    with open('tests/files/example_dblp.xml', 'rb') as f:
+        dblp_importer.bezerk_mode = False  # second pass over the same file: the records are now expected to count as 'exists'
+        dblp_importer.reset()
+        counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
+    print(counts)
+    assert counts['insert'] == 0
+    assert counts['exists'] == 3
+    assert counts['skip'] == 1
+    assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index  # the second pass added no changelog entry
+
+def test_dblp_xml_parse(dblp_importer):  # parse the first <article> of example_dblp_article.xml and check the resulting release fields
+    with open('tests/files/example_dblp_article.xml', 'r') as f:
+        soup = BeautifulSoup(f, "xml")
+        r1 = dblp_importer.parse_record(soup.find_all("article")[0])
+
+    assert r1.title == "Jim Gray, astronomer"
+    assert r1.contribs[0].raw_name == "Alexander S. Szalay"
+    # contribs[1] is covered by test_dblp_importer, via the LXML import path
+    #assert r1.contribs[1].raw_name == "Michael H. Bohlen"
+    assert r1.contribs[2].raw_name == "Nicolas Heist"
+    # XXX: assert r1.contribs[2].extra['orcid'] == "0000-0002-4354-9138"
+    assert r1.contribs[3].raw_name == "Jens Lehmann"
+    assert r1.ext_ids.dblp == "journals/cacm/Szalay08"
+    assert r1.ext_ids.doi == "10.1145/1400214.1400231"
+    assert r1.pages == "58-65"
+    assert r1.issue == "11"
+    assert r1.volume == "51"
+    assert r1.release_year == 2008
+    assert r1.extra['container_name'] == "Commun. ACM"
+    assert r1.extra['dblp']['type'] == "article"