summary | refs | log | tree | commit | diff | stats
path: root/python/tests/import_dblp.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-12-17 01:55:30 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-12-17 23:03:08 -0800
commit: 58ff361eb481bee9d2ef7249f48f94729d2a830d (patch)
tree: 9cbcbcf1c7eafa958c81d18fcf5b6a5d27f2f42d /python/tests/import_dblp.py
parent: b2f53077bd05a536b5fdb551755a559b653421d3 (diff)
download: fatcat-58ff361eb481bee9d2ef7249f48f94729d2a830d.tar.gz
download: fatcat-58ff361eb481bee9d2ef7249f48f94729d2a830d.zip
very simple dblp container importer
Diffstat (limited to 'python/tests/import_dblp.py')
-rw-r--r-- python/tests/import_dblp.py 61
1 files changed, 56 insertions, 5 deletions
diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py
index fd40eb06..5041b509 100644
--- a/python/tests/import_dblp.py
+++ b/python/tests/import_dblp.py
@@ -1,8 +1,9 @@
+import io
import pytest
from bs4 import BeautifulSoup
-from fatcat_tools.importers import DblpReleaseImporter, Bs4XmlLargeFilePusher
+from fatcat_tools.importers import DblpReleaseImporter, DblpContainerImporter, Bs4XmlLargeFilePusher, JsonLinePusher
from fixtures import *
@@ -13,15 +14,21 @@ def dblp_importer(api):
@pytest.fixture(scope="function")
def dblp_importer_existing(api):
- with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield DblpReleaseImporter(api, issn_file, bezerk_mode=False, lookup_refs=True)
+ with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file:
+ yield DblpReleaseImporter(api, tsv_file, bezerk_mode=False)
+
+@pytest.fixture(scope="function")
+def dblp_container_importer(api):
+ with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file:
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield DblpContainerImporter(api, issn_file, tsv_file, io.StringIO(), bezerk_mode=True)
def test_dblp_importer(dblp_importer):
last_index = dblp_importer.api.get_changelog(limit=1)[0].index
with open('tests/files/example_dblp.xml', 'rb') as f:
dblp_importer.bezerk_mode = True
counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
- print(counts)
+ #print(counts)
assert counts['insert'] == 3
assert counts['exists'] == 0
assert counts['skip'] == 1
@@ -44,12 +51,56 @@ def test_dblp_importer(dblp_importer):
dblp_importer.bezerk_mode = False
dblp_importer.reset()
counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
- print(counts)
+ #print(counts)
assert counts['insert'] == 0
assert counts['exists'] == 3
assert counts['skip'] == 1
assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index
+def test_dblp_container_importer(dblp_container_importer):
+ last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
+ output_tsv_map = io.StringIO()
+ with open('tests/files/example_dblp_containers.json', 'r') as f:
+ dblp_container_importer.bezerk_mode = True
+ dblp_container_importer.dblp_container_map_output = output_tsv_map
+ counts = JsonLinePusher(dblp_container_importer, f).run()
+ assert counts['insert'] == 10
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = dblp_container_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "dblp" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.DblpContainerImporter" in eg.extra['agent']
+
+ # check that entity name mangling was fixed on import
+ eg = dblp_container_importer.api.get_editgroup(eg.editgroup_id)
+ container = dblp_container_importer.api.get_container(eg.edits.containers[0].ident)
+ assert container.name == "Atlantis Thinking Machines"
+ assert container.issnl == "1877-3273"
+ assert container.container_type == "book-series"
+ assert container.extra['dblp']['prefix'] == "series/atlantis"
+ assert container.extra['urls'] == ["http://link.springer.com/bookseries/10077"]
+
+ last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
+ output_tsv_map.seek(0)
+ print(output_tsv_map.read())
+ output_tsv_map.seek(0)
+ with open('tests/files/example_dblp_containers.json', 'r') as f:
+ dblp_container_importer.reset()
+ dblp_container_importer.bezerk_mode = False
+ dblp_container_importer.dblp_container_map_output = io.StringIO()
+ dblp_container_importer.read_dblp_container_map_file(output_tsv_map)
+ counts = JsonLinePusher(dblp_container_importer, f).run()
+ print(counts)
+ assert counts['insert'] == 0
+ assert counts['exists'] == 10
+ assert counts['skip'] == 0
+ assert last_index == dblp_container_importer.api.get_changelog(limit=1)[0].index
+
def test_dblp_xml_parse(dblp_importer):
with open('tests/files/example_dblp_article.xml', 'r') as f:
soup = BeautifulSoup(f, "xml")