1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
import io
import pytest
from bs4 import BeautifulSoup
from fatcat_tools.importers import DblpReleaseImporter, DblpContainerImporter, Bs4XmlLargeFilePusher, JsonLinePusher
from fixtures import *
@pytest.fixture(scope="function")
def dblp_importer(api):
    """Yield a DblpReleaseImporter wired to the test container-map TSV.

    bezerk_mode=True so the importer batches/inserts without lookups.
    """
    with open('tests/files/dblp_container_map.tsv', 'r') as container_map_file:
        yield DblpReleaseImporter(api, container_map_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def dblp_container_importer(api):
    """Yield a DblpContainerImporter wired to test TSV/ISSN fixture files.

    The io.StringIO() argument is a throwaway sink for the container-map
    output; tests that care about it replace the attribute directly.
    """
    with open('tests/files/dblp_container_map.tsv', 'r') as container_map_file, \
            open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_map_file:
        yield DblpContainerImporter(
            api,
            issn_map_file,
            container_map_file,
            io.StringIO(),
            bezerk_mode=True,
        )
def test_dblp_importer(dblp_importer):
    """End-to-end import of the example dblp XML file, run twice.

    First pass (bezerk_mode=True) should insert 3 releases and skip 1;
    the second pass (bezerk_mode=False) should find all 3 already
    existing and create no new changelog entries.
    """
    last_index = dblp_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/example_dblp.xml', 'rb') as f:
        dblp_importer.bezerk_mode = True
        counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
    assert counts['insert'] == 3
    assert counts['exists'] == 0
    assert counts['skip'] == 1

    # fetch most recent editgroup and check its metadata
    change = dblp_importer.api.get_changelog_entry(index=last_index+1)
    eg = change.editgroup
    assert eg.description
    assert "dblp" in eg.description.lower()
    assert eg.extra['git_rev']
    assert "fatcat_tools.DblpReleaseImporter" in eg.extra['agent']

    # check that entity name mangling was fixed on import: scan the
    # editgroup for the specific release, then verify its contribs.
    # (Fix: the original code had a dead assignment fetching
    # eg.edits.releases[0] that was immediately overwritten by this loop.)
    eg = dblp_importer.api.get_editgroup(eg.editgroup_id)
    for r_edit in eg.edits.releases:
        release = dblp_importer.api.get_release(r_edit.ident)
        if release.ext_ids.dblp == "conf/er/Norrie08":
            break
    assert release.ext_ids.dblp == "conf/er/Norrie08"
    assert release.contribs[0].raw_name == "Moira C. Norrie"
    assert release.contribs[1].raw_name == "Michael H. Böhlen"

    # second pass: idempotent re-import should only hit 'exists'
    last_index = dblp_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/example_dblp.xml', 'rb') as f:
        dblp_importer.bezerk_mode = False
        dblp_importer.reset()
        counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
    assert counts['insert'] == 0
    assert counts['exists'] == 3
    assert counts['skip'] == 1
    # no new changelog entry should have been created
    assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index
def test_dblp_container_importer(dblp_container_importer):
    """End-to-end import of the example dblp container JSON, run twice.

    The first pass inserts 10 containers and records a dblp-prefix ->
    container-ident map to an in-memory TSV; the second pass reads that
    map back and should find every container already existing, with no
    new changelog entries.
    """
    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
    output_tsv_map = io.StringIO()
    with open('tests/files/example_dblp_containers.json', 'r') as f:
        dblp_container_importer.bezerk_mode = True
        dblp_container_importer.dblp_container_map_output = output_tsv_map
        counts = JsonLinePusher(dblp_container_importer, f).run()
    assert counts['insert'] == 10
    assert counts['exists'] == 0
    assert counts['skip'] == 0

    # fetch most recent editgroup and check its metadata
    change = dblp_container_importer.api.get_changelog_entry(index=last_index+1)
    eg = change.editgroup
    assert eg.description
    assert "dblp" in eg.description.lower()
    assert eg.extra['git_rev']
    assert "fatcat_tools.DblpContainerImporter" in eg.extra['agent']

    # check that entity name mangling was fixed on import: scan the
    # editgroup for a specific container, then verify its fields
    eg = dblp_container_importer.api.get_editgroup(eg.editgroup_id)
    for c_edit in eg.edits.containers:
        container = dblp_container_importer.api.get_container(c_edit.ident)
        if container.issnl == "1877-3273":
            break
    assert container.name == "Atlantis Thinking Machines"
    assert container.issnl == "1877-3273"
    assert container.container_type == "book-series"
    assert container.extra['dblp']['prefix'] == "series/atlantis"
    assert container.extra['urls'] == ["http://link.springer.com/bookseries/10077"]

    # second pass: feed the emitted container map back in; re-import
    # should only hit 'exists' (removed a leftover debug print here)
    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
    output_tsv_map.seek(0)
    with open('tests/files/example_dblp_containers.json', 'r') as f:
        dblp_container_importer.reset()
        dblp_container_importer.bezerk_mode = False
        dblp_container_importer.dblp_container_map_output = io.StringIO()
        dblp_container_importer.read_dblp_container_map_file(output_tsv_map)
        counts = JsonLinePusher(dblp_container_importer, f).run()
    assert counts['insert'] == 0
    assert counts['exists'] == 10
    assert counts['skip'] == 0
    # no new changelog entry should have been created
    assert last_index == dblp_container_importer.api.get_changelog(limit=1)[0].index
def test_dblp_xml_parse(dblp_importer):
    """Parse a single dblp <article> record and spot-check the fields."""
    with open('tests/files/example_dblp_article.xml', 'r') as xml_file:
        soup = BeautifulSoup(xml_file, "xml")
        release = dblp_importer.parse_record(soup.find_all("article")[0])

    assert release.title == "Jim Gray, astronomer"

    # contributors (name mangling for contribs[1] is covered by the
    # LXML import-path test above)
    assert release.contribs[0].raw_name == "Alexander S. Szalay"
    #assert release.contribs[1].raw_name == "Michael H. Bohlen"
    assert release.contribs[2].raw_name == "Nicolas Heist"
    # XXX: assert release.contribs[2].extra['orcid'] == "0000-0002-4354-9138"
    assert release.contribs[3].raw_name == "Jens Lehmann"

    # external identifiers
    assert release.ext_ids.dblp == "journals/cacm/Szalay08"
    assert release.ext_ids.doi == "10.1145/1400214.1400231"

    # bibliographic metadata
    assert release.pages == "58-65"
    assert release.issue == "11"
    assert release.volume == "51"
    assert release.release_year == 2008
    #assert release.extra['container_name'] == "Commun. ACM"
    assert release.extra['dblp']['type'] == "article"
|