1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
|
import gzip
import json
import pytest
from fixtures import api
from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
@pytest.fixture(scope="function")
def crossref_importer(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
yield CrossrefImporter(api, issn_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def crossref_importer_existing(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
yield CrossrefImporter(api, issn_file, bezerk_mode=False)
@pytest.mark.skip(
reason="slow/huge crossref import is a corner-case and slows tests significantly"
)
def test_crossref_importer_huge(crossref_importer):
last_index = crossref_importer.api.get_changelog(limit=1)[0].index
with gzip.open("tests/files/huge_crossref_doi.json.gz", "rt") as f:
crossref_importer.bezerk_mode = True
line = f.readline()
mega_blob = [line for i in range(95)]
counts = JsonLinePusher(crossref_importer, mega_blob).run()
assert counts["insert"] == 95
change = crossref_importer.api.get_changelog_entry(index=last_index + 1)
release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident)
assert len(release.contribs) == 1014
def test_crossref_importer(crossref_importer):
last_index = crossref_importer.api.get_changelog(limit=1)[0].index
with open("tests/files/crossref-works.2018-01-21.badsample.json", "r") as f:
crossref_importer.bezerk_mode = True
counts = JsonLinePusher(crossref_importer, f).run()
assert counts["insert"] == 14
assert counts["exists"] == 0
assert counts["skip"] == 0
# fetch most recent editgroup
change = crossref_importer.api.get_changelog_entry(index=last_index + 1)
eg = change.editgroup
assert eg.description
assert "crossref" in eg.description.lower()
assert eg.extra["git_rev"]
assert "fatcat_tools.CrossrefImporter" in eg.extra["agent"]
last_index = crossref_importer.api.get_changelog(limit=1)[0].index
with open("tests/files/crossref-works.2018-01-21.badsample.json", "r") as f:
crossref_importer.bezerk_mode = False
crossref_importer.reset()
counts = JsonLinePusher(crossref_importer, f).run()
assert counts["insert"] == 0
assert counts["exists"] == 14
assert counts["skip"] == 0
assert last_index == crossref_importer.api.get_changelog(limit=1)[0].index
def test_crossref_mappings(crossref_importer):
assert crossref_importer.map_release_type("journal-article") == "article-journal"
assert crossref_importer.map_release_type("asdf") is None
assert crossref_importer.map_release_type("book-series") is None
assert crossref_importer.map_release_type("standard") == "standard"
def test_crossref_importer_create(crossref_importer):
crossref_importer.create_containers = True
with open("tests/files/crossref-works.2018-01-21.badsample.json", "r") as f:
JsonLinePusher(crossref_importer, f).run()
def test_crossref_dict_parse(crossref_importer):
with open("tests/files/crossref-works.single.json", "r") as f:
# not a single line
raw = json.loads(f.read())
r = crossref_importer.parse_record(raw)
# ensure the API server is ok with format
JsonLinePusher(crossref_importer, [json.dumps(raw)]).run()
print(r.extra)
assert (
r.title
== "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators"
)
assert r.publisher == "Wiley-Blackwell"
assert r.release_type == "article-journal"
assert r.release_stage == "published"
assert r.license_slug == "CC-BY-NC-ND"
assert r.original_title == "Renormalized perturbation theory auf deutsch"
assert r.ext_ids.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t"
assert r.ext_ids.isbn13 == "978-3-16-148410-0"
assert r.language == "fr"
assert r.subtitle is None
assert "subtitle" not in r.extra
assert "subtitle" not in r.extra["crossref"]
assert "funder" not in r.extra
assert "funder" not in r.extra["crossref"]
# matched by ISSN, so shouldn't be in there
# assert extra['container_name'] == "International Journal of Quantum Chemistry"
assert r.extra["aliases"] == ["some other title"]
assert r.extra["crossref"]["archive"] == ["Portico", "LOCKSS"]
assert len(r.contribs) == 6
assert r.contribs[0].raw_name == "Marcelo D. Radicioni"
assert r.contribs[0].given_name == "Marcelo D."
assert r.contribs[0].surname == "Radicioni"
assert r.contribs[0].index == 0
assert r.contribs[0].extra["seq"] == "first"
assert r.contribs[1].raw_affiliation == "Some University"
assert r.contribs[1].extra["more_affiliations"] == ["Some Department"]
assert r.contribs[1].role == "author"
assert r.contribs[4].role == "editor"
assert r.contribs[4].index is None
assert r.contribs[4].extra is None
assert r.contribs[5].role == "translator"
assert r.contribs[5].index is None
assert len(r.refs) == 25
assert r.refs[0].key == "BIB1"
assert r.refs[0].year == 1972
assert r.refs[0].locator == "1734"
assert r.refs[0].container_name == "J. Chem. Phys."
assert r.refs[0].extra == {
"volume": "57",
"authors": ["Swenson"],
"doi": "10.1063/1.1678462",
"medium": "DVD",
}
assert r.refs[2].key == "BIB3"
assert r.refs[2].extra.get("author") is None
assert (
r.refs[2].container_name == "Hypervirial Theorem's, Lecture Notes in Chemistry <3"
)
assert (
r.refs[3].container_name
== "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry"
)
def test_crossref_subtitle(crossref_importer):
"""
Tests new subtitle field, explicitly
"""
with open("tests/files/crossref-works.single.json", "r") as f:
# not a single line
raw = json.loads(f.read())
raw["subtitle"] = ["some bogus subtitle", "blah"]
r = crossref_importer.parse_record(raw)
# ensure the API server is ok with format
JsonLinePusher(crossref_importer, [json.dumps(raw)]).run()
print(r.extra)
assert (
r.title
== "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators"
)
assert r.subtitle == "some bogus subtitle"
assert "subtitle" not in r.extra
assert "subtitle" not in r.extra["crossref"]
def test_stateful_checking(crossref_importer_existing):
with open("tests/files/crossref-works.single.json", "r") as f:
# not a single line, a whole document
raw = f.read()
# might not exist yet...
crossref_importer_existing.push_record(json.loads(raw))
crossref_importer_existing.finish()
# make sure we wouldn't insert again
entity = crossref_importer_existing.parse_record(json.loads(raw))
assert crossref_importer_existing.try_update(entity) is False
|