1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
import pytest
from bs4 import BeautifulSoup
from fixtures import *
from fatcat_tools.importers import Bs4XmlLargeFilePusher, PubmedImporter
@pytest.fixture(scope="function")
def pubmed_importer(api):
    """
    Yields a PubmedImporter constructed with bezerk_mode=True and
    lookup_refs=True.

    The ISSN-to-ISSN-L snippet file is kept open while the requesting test
    runs and is closed by the `with` block once the fixture is torn down.
    """
    issn_map_path = "tests/files/ISSN-to-ISSN-L.snip.txt"
    with open(issn_map_path, "r") as issn_map:
        importer = PubmedImporter(
            api,
            issn_map,
            extid_map_file="tests/files/example_map.sqlite3",
            bezerk_mode=True,
            lookup_refs=True,
        )
        yield importer
@pytest.fixture(scope="function")
def pubmed_importer_existing(api):
    """
    Yields a PubmedImporter constructed with bezerk_mode=False and
    lookup_refs=True.

    The ISSN-to-ISSN-L snippet file is kept open while the requesting test
    runs and is closed by the `with` block once the fixture is torn down.
    """
    issn_map_path = "tests/files/ISSN-to-ISSN-L.snip.txt"
    with open(issn_map_path, "r") as issn_map:
        importer = PubmedImporter(
            api,
            issn_map,
            extid_map_file="tests/files/example_map.sqlite3",
            bezerk_mode=False,
            lookup_refs=True,
        )
        yield importer
def test_pubmed_importer(pubmed_importer):
    """
    Pushes the 2019 PubMed sample through the importer twice.

    First pass (bezerk_mode=True): all 176 records are expected to be
    inserted, and the changelog entry following the previously-latest index
    must carry an editgroup describing a pubmed import.

    Second pass (bezerk_mode=False, after reset()): all 176 records are
    expected to be reported as existing, and the latest changelog index must
    be unchanged.
    """
    sample_path = "tests/files/pubmedsample_2019.xml"
    api = pubmed_importer.api

    last_index = api.get_changelog(limit=1)[0].index
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding; this sample contains non-ASCII text (e.g. author names).
    with open(sample_path, "r", encoding="utf-8") as f:
        pubmed_importer.bezerk_mode = True
        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, ["PubmedArticle"]).run()
    assert counts["insert"] == 176
    assert counts["exists"] == 0
    assert counts["skip"] == 0

    # fetch most recent editgroup
    change = api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "pubmed" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.PubmedImporter" in eg.extra["agent"]

    # second pass over the same file: nothing new should be inserted
    last_index = api.get_changelog(limit=1)[0].index
    with open(sample_path, "r", encoding="utf-8") as f:
        pubmed_importer.bezerk_mode = False
        pubmed_importer.reset()
        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, ["PubmedArticle"]).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 176
    assert counts["skip"] == 0
    # the changelog must not have advanced
    assert last_index == api.get_changelog(limit=1)[0].index
def test_pubmed_xml_parse(pubmed_importer):
    """
    Parses the first and last PubmedArticle of the 2019 sample file and checks
    the resulting release fields (title, identifiers, dates, contribs,
    abstracts, refs) against known values.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding; the assertions below compare non-ASCII names and affiliations.
    with open("tests/files/pubmedsample_2019.xml", "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "xml")
        # search the document once and reuse the result for both records
        articles = soup.find_all("PubmedArticle")
        r1 = pubmed_importer.parse_record(articles[0])
        r2 = pubmed_importer.parse_record(articles[-1])

    # --- first record: older article, no DOI, no refs ---
    assert r1.title == "Hospital debt management and cost reimbursement"
    assert r1.subtitle is None
    assert r1.original_title is None
    assert r1.publisher is None
    assert r1.release_type == "article-journal"
    assert r1.release_stage == "published"
    assert r1.license_slug is None
    assert r1.ext_ids.doi is None
    assert r1.ext_ids.pmid == "973217"
    assert r1.language == "en"
    assert r1.volume == "3"
    assert r1.issue == "1"
    assert r1.pages == "69-81"
    assert r1.release_date is None  # not "1976-12-03", which is medline ingest date
    assert r1.release_year == 1976
    # matched by ISSN, so shouldn't be in there?
    # assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
    assert len(r1.contribs) == 1
    assert r1.contribs[0].raw_name == "F R Blume"
    assert r1.contribs[0].given_name == "F R"
    assert r1.contribs[0].surname == "Blume"

    # debugging aid: pytest displays captured output when the test fails
    print(r1.extra)
    assert r1.extra["pubmed"]["pub_types"] == ["Journal Article"]
    assert not r1.refs

    # --- last record: recent article with DOI, PMCID, affiliations, refs ---
    assert (
        r2.title
        == "Synthesis and Antibacterial Activity of Metal(loid) Nanostructures by Environmental Multi-Metal(loid) Resistant Bacteria and Metal(loid)-Reducing Flavoproteins"
    )
    assert r2.subtitle is None
    assert r2.original_title is None
    assert r2.publisher is None
    assert r2.release_type == "article-journal"
    assert r2.release_stage == "published"
    assert r2.license_slug is None
    assert r2.ext_ids.doi == "10.3389/fmicb.2018.00959"
    assert r2.ext_ids.pmid == "29869640"
    assert r2.ext_ids.pmcid == "PMC5962736"
    assert r2.language == "en"
    assert r2.volume == "9"
    assert r2.issue is None
    assert r2.pages == "959"
    assert str(r2.release_date) == "2018-05-15"
    assert r2.release_year == 2018
    # matched by ISSN, so shouldn't be in there?
    # assert extra['container_name'] == "Frontiers in microbiology"

    assert len(r2.contribs) > 3
    assert r2.contribs[0].raw_name == "Maximiliano Figueroa"
    assert r2.contribs[0].given_name == "Maximiliano"
    assert r2.contribs[0].surname == "Figueroa"
    assert (
        r2.contribs[0].raw_affiliation
        == "Laboratorio Microbiología Molecular, Departamento de Biología, Facultad de Química y Biología, Universidad de Santiago de Chile, Santiago, Chile."
    )
    assert r2.contribs[4].surname == "Muñoz-Villagrán"
    assert r2.contribs[7].surname == "Latorre"
    assert (
        r2.contribs[7].raw_affiliation
        == "Mathomics, Centro de Modelamiento Matemático, Universidad de Chile, Beauchef, Santiago, Chile."
    )
    assert r2.contribs[7].extra["more_affiliations"] == [
        "Fondap-Center of Genome Regulation, Facultad de Ciencias, Universidad de Chile, Santiago, Chile.",
        "Laboratorio de Bioinformática y Expresión Génica, INTA, Universidad de Chile, Santiago, Chile.",
        "Instituto de Ciencias de la Ingeniería, Universidad de O'Higgins, Rancagua, Chile.",
    ]
    assert r2.contribs[-1].raw_name == "Felipe Arenas"

    assert r2.abstracts[0].content.startswith(
        "Microbes are suitable candidates to recover and decontaminate different environments from soluble metal ions, either via reduction"
    )
    assert r2.abstracts[0].lang == "en"

    # debugging aid: pytest displays captured output when the test fails
    print(r2.extra)
    assert r2.extra["pubmed"]["pub_types"] == ["Journal Article"]

    assert r2.refs[0].extra["unstructured"] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
    assert r2.refs[0].extra["pmid"] == "19383690"
    assert len(r2.refs) > 1
def test_pubmed_xml_dates(pubmed_importer):
    """
    Checks that the first PubmedArticle in pubmed_31393839.xml parses to a
    release with release_year 2019
    """
    with open("tests/files/pubmed_31393839.xml", "r") as xml_file:
        document = BeautifulSoup(xml_file, "xml")
        first_article = document.find_all("PubmedArticle")[0]
        release = pubmed_importer.parse_record(first_article)

    assert release.release_year == 2019
def test_pubmed_xml_parse_refs(pubmed_importer):
    """
    Covers records whose references are spread over several nested
    ReferenceList/Reference elements rather than one ReferenceList holding
    many Reference children: more than one ref must still be parsed.
    """
    with open("tests/files/pubmed_19129924.xml", "r") as xml_file:
        document = BeautifulSoup(xml_file, "xml")
        first_article = document.find_all("PubmedArticle")[0]
        release = pubmed_importer.parse_record(first_article)

    assert len(release.refs) > 1
|