python/tests/import_pubmed.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166

import pytest
from bs4 import BeautifulSoup
from fixtures import *

from fatcat_tools.importers import Bs4XmlLargeFilePusher, PubmedImporter


@pytest.fixture(scope="function")
def pubmed_importer(api):
    """
    Yields a PubmedImporter built with bezerk_mode=True and lookup_refs=True.

    The importer is handed the open ISSN-to-ISSN-L snippet file and the path
    of the example sqlite ext-id map under tests/files. The ISSN file stays
    open while the fixture value is in use and is closed on teardown.
    """
    issn_map_path = "tests/files/ISSN-to-ISSN-L.snip.txt"
    extid_map_path = "tests/files/example_map.sqlite3"
    with open(issn_map_path, "r") as issn_map:
        importer = PubmedImporter(
            api,
            issn_map,
            extid_map_file=extid_map_path,
            bezerk_mode=True,
            lookup_refs=True,
        )
        yield importer


@pytest.fixture(scope="function")
def pubmed_importer_existing(api):
    """
    Yields a PubmedImporter built with bezerk_mode=False and lookup_refs=True.

    Uses the same ISSN-to-ISSN-L snippet file and example sqlite ext-id map
    under tests/files. The ISSN file stays open while the fixture value is in
    use and is closed on teardown.
    """
    issn_map_path = "tests/files/ISSN-to-ISSN-L.snip.txt"
    extid_map_path = "tests/files/example_map.sqlite3"
    with open(issn_map_path, "r") as issn_map:
        importer = PubmedImporter(
            api,
            issn_map,
            extid_map_file=extid_map_path,
            bezerk_mode=False,
            lookup_refs=True,
        )
        yield importer


def test_pubmed_importer(pubmed_importer):
    """
    Pushes the 2019 PubMed sample file through the importer twice.

    The first pass runs with bezerk_mode enabled and expects 176 inserts; the
    editgroup of the following changelog entry is then checked for its
    description and agent metadata. The second pass runs after reset() with
    bezerk_mode disabled and expects 176 "exists" results and an unchanged
    latest changelog index.
    """
    sample_path = "tests/files/pubmedsample_2019.xml"

    # --- first pass: everything should be inserted ---
    index_before = pubmed_importer.api.get_changelog(limit=1)[0].index
    with open(sample_path, "r") as xml_file:
        pubmed_importer.bezerk_mode = True
        pusher = Bs4XmlLargeFilePusher(pubmed_importer, xml_file, ["PubmedArticle"])
        first_pass = pusher.run()
    assert first_pass["insert"] == 176
    assert first_pass["exists"] == 0
    assert first_pass["skip"] == 0

    # the changelog entry right after the recorded index carries the editgroup
    entry = pubmed_importer.api.get_changelog_entry(index=index_before + 1)
    editgroup = entry.editgroup
    assert editgroup.description
    assert "pubmed" in editgroup.description.lower()
    assert editgroup.extra["git_rev"]
    assert "fatcat_tools.PubmedImporter" in editgroup.extra["agent"]

    # --- second pass: same file, nothing new should be created ---
    index_before = pubmed_importer.api.get_changelog(limit=1)[0].index
    with open(sample_path, "r") as xml_file:
        pubmed_importer.bezerk_mode = False
        pubmed_importer.reset()
        pusher = Bs4XmlLargeFilePusher(pubmed_importer, xml_file, ["PubmedArticle"])
        second_pass = pusher.run()
    assert second_pass["insert"] == 0
    assert second_pass["exists"] == 176
    assert second_pass["skip"] == 0

    # no new changelog entry may have appeared during the second pass
    index_after = pubmed_importer.api.get_changelog(limit=1)[0].index
    assert index_before == index_after


def test_pubmed_xml_parse(pubmed_importer):
    """
    Parses the first and the last PubmedArticle of the 2019 sample file and
    checks the resulting release fields, contributors, abstract and references
    """
    with open("tests/files/pubmedsample_2019.xml", "r") as xml_file:
        doc = BeautifulSoup(xml_file, "xml")
        articles = doc.find_all("PubmedArticle")
        first = pubmed_importer.parse_record(articles[0])
        last = pubmed_importer.parse_record(articles[-1])

    # ---- first article in the file ----
    assert first.title == "Hospital debt management and cost reimbursement"
    assert first.subtitle is None
    assert first.original_title is None
    assert first.publisher is None
    assert first.release_type == "article-journal"
    assert first.release_stage == "published"
    assert first.license_slug is None
    assert first.ext_ids.doi is None
    assert first.ext_ids.pmid == "973217"
    assert first.language == "en"
    assert first.volume == "3"
    assert first.issue == "1"
    assert first.pages == "69-81"
    # "1976-12-03" is the medline ingest date, so it must not be used here
    assert first.release_date is None
    assert first.release_year == 1976
    # NOTE: extra["container_name"] is deliberately not checked; the container
    # is matched by ISSN, so presumably the name is not recorded (unconfirmed)
    assert len(first.contribs) == 1

    sole_author = first.contribs[0]
    assert sole_author.raw_name == "F R Blume"
    assert sole_author.given_name == "F R"
    assert sole_author.surname == "Blume"

    print(first.extra)
    assert first.extra["pubmed"]["pub_types"] == ["Journal Article"]
    assert not first.refs

    # ---- last article in the file ----
    expected_title = "Synthesis and Antibacterial Activity of Metal(loid) Nanostructures by Environmental Multi-Metal(loid) Resistant Bacteria and Metal(loid)-Reducing Flavoproteins"
    assert last.title == expected_title
    assert last.subtitle is None
    assert last.original_title is None
    assert last.publisher is None
    assert last.release_type == "article-journal"
    assert last.release_stage == "published"
    assert last.license_slug is None
    assert last.ext_ids.doi == "10.3389/fmicb.2018.00959"
    assert last.ext_ids.pmid == "29869640"
    assert last.ext_ids.pmcid == "PMC5962736"
    assert last.language == "en"
    assert last.volume == "9"
    assert last.issue is None
    assert last.pages == "959"
    assert str(last.release_date) == "2018-05-15"
    assert last.release_year == 2018
    # NOTE: as above, extra["container_name"] is not checked for this record

    assert len(last.contribs) > 3
    lead_author = last.contribs[0]
    assert lead_author.raw_name == "Maximiliano Figueroa"
    assert lead_author.given_name == "Maximiliano"
    assert lead_author.surname == "Figueroa"
    expected_lead_affiliation = "Laboratorio Microbiología Molecular, Departamento de Biología, Facultad de Química y Biología, Universidad de Santiago de Chile, Santiago, Chile."
    assert lead_author.raw_affiliation == expected_lead_affiliation
    assert last.contribs[4].surname == "Muñoz-Villagrán"

    # contributor with several affiliations: the first one is the raw
    # affiliation, the remaining ones are expected under extra
    multi_affil_author = last.contribs[7]
    assert multi_affil_author.surname == "Latorre"
    expected_primary_affiliation = "Mathomics, Centro de Modelamiento Matemático, Universidad de Chile, Beauchef, Santiago, Chile."
    assert multi_affil_author.raw_affiliation == expected_primary_affiliation
    expected_more_affiliations = [
        "Fondap-Center of Genome Regulation, Facultad de Ciencias, Universidad de Chile, Santiago, Chile.",
        "Laboratorio de Bioinformática y Expresión Génica, INTA, Universidad de Chile, Santiago, Chile.",
        "Instituto de Ciencias de la Ingeniería, Universidad de O'Higgins, Rancagua, Chile.",
    ]
    assert multi_affil_author.extra["more_affiliations"] == expected_more_affiliations
    assert last.contribs[-1].raw_name == "Felipe Arenas"

    abstract = last.abstracts[0]
    expected_abstract_prefix = "Microbes are suitable candidates to recover and decontaminate different environments from soluble metal ions, either via reduction"
    assert abstract.content.startswith(expected_abstract_prefix)
    assert abstract.lang == "en"

    print(last.extra)
    assert last.extra["pubmed"]["pub_types"] == ["Journal Article"]

    first_ref = last.refs[0]
    assert first_ref.extra["unstructured"] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
    assert first_ref.extra["pmid"] == "19383690"
    assert len(last.refs) > 1


def test_pubmed_xml_dates(pubmed_importer):
    """
    Parses the single-record file for PMID 31393839 and checks that the
    release year comes out as 2019
    """
    with open("tests/files/pubmed_31393839.xml", "r") as xml_file:
        doc = BeautifulSoup(xml_file, "xml")
        article = doc.find_all("PubmedArticle")[0]
        release = pubmed_importer.parse_record(article)

    assert release.release_year == 2019


def test_pubmed_xml_parse_refs(pubmed_importer):
    """
    Covers records whose references are spread over several nested
    ReferenceList/Reference elements rather than one ReferenceList holding
    all Reference entries; more than one ref must still be extracted
    """
    with open("tests/files/pubmed_19129924.xml", "r") as xml_file:
        doc = BeautifulSoup(xml_file, "xml")
        article = doc.find_all("PubmedArticle")[0]
        release = pubmed_importer.parse_record(article)

    ref_count = len(release.refs)
    assert ref_count > 1