aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests/import_doaj.py
blob: 9c4ba552b5c94ed9e761fbaad3900062534676bf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import datetime
import json

import elasticsearch
import fatcat_openapi_client
import pytest
from fixtures import *

from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher
from fatcat_tools.transforms import entity_to_dict


@pytest.fixture(scope="function")
def doaj_importer(api, mocker):
    """
    Yield a DoajArticleImporter built on the test ``api`` fixture.

    The importer is handed an Elasticsearch client pointed at "mockbackend";
    ``Urllib3HttpConnection.perform_request`` is replaced via ``mocker.patch``
    right after the client is created. The ISSN-to-ISSN-L snippet file is kept
    open while the fixture value is in use and is closed when the fixture
    generator is resumed.
    """
    es_client = elasticsearch.Elasticsearch("mockbackend")
    mocker.patch("elasticsearch.connection.Urllib3HttpConnection.perform_request")

    issn_map_path = "tests/files/ISSN-to-ISSN-L.snip.txt"
    with open(issn_map_path, "r") as issn_map_file:
        importer = DoajArticleImporter(
            api,
            issn_map_file,
            bezerk_mode=True,
            es_client=es_client,
        )
        yield importer


def test_doaj_importer(doaj_importer):
    """
    Run the example DOAJ articles through the importer twice.

    The first pass (bezerk mode on, fuzzy matching off) must insert all five
    records in exactly one new changelog entry whose editgroup metadata
    identifies the importer. The second pass (bezerk mode off) must report all
    five as existing and must not add a changelog entry. The releases created
    by the first pass are deleted at the end.
    """
    articles_path = "tests/files/example_doaj_articles.json"

    def newest_changelog_entry():
        # always go through doaj_importer.api so each call reads it afresh
        return doaj_importer.api.get_changelog(limit=1)[0]

    # --- first pass: everything gets inserted ---
    index_before = newest_changelog_entry().index
    with open(articles_path, "r") as articles:
        doaj_importer.bezerk_mode = True
        doaj_importer.do_fuzzy_match = False
        pusher = JsonLinePusher(doaj_importer, articles)
        first_counts = pusher.run()
    for key, expected in (("insert", 5), ("exists", 0), ("skip", 0)):
        assert first_counts[key] == expected
    success_changelog = newest_changelog_entry()
    assert index_before + 1 == success_changelog.index

    # inspect the editgroup behind the new changelog entry
    entry = doaj_importer.api.get_changelog_entry(index=index_before + 1)
    editgroup = entry.editgroup
    assert editgroup.description
    assert "doaj" in editgroup.description.lower()
    assert editgroup.extra["git_rev"]
    assert "fatcat_tools.DoajArticleImporter" in editgroup.extra["agent"]

    # --- second pass: nothing new, changelog untouched ---
    index_before = newest_changelog_entry().index
    with open(articles_path, "r") as articles:
        doaj_importer.bezerk_mode = False
        doaj_importer.reset()
        pusher = JsonLinePusher(doaj_importer, articles)
        second_counts = pusher.run()
    for key, expected in (("insert", 0), ("exists", 5), ("skip", 0)):
        assert second_counts[key] == expected
    assert index_before == newest_changelog_entry().index

    # cleanup: delete the releases from the first pass (so other import tests work)
    imported_editgroup = doaj_importer.api.get_editgroup(success_changelog.editgroup_id)
    cleanup_eg = quick_eg(doaj_importer.api)
    for edit in imported_editgroup.edits.releases:
        doaj_importer.api.delete_release(cleanup_eg.editgroup_id, edit.ident)
    doaj_importer.api.accept_editgroup(cleanup_eg.editgroup_id)


def test_doaj_importer_existing_doi(doaj_importer):
    """
    One of the DOAJ test entities has a dummy DOI (10.123/abc); this test
    ensures that it isn't clobbered, and then that it gets updated.

    Three imports of the same file are run with bezerk mode and fuzzy matching
    off: first with updates disabled (4 inserts, 1 exists), then with updates
    enabled (4 exists, 1 update), then once more with updates enabled
    (5 exists, no update). Afterwards the changes are reverted in a single
    cleanup editgroup.
    """
    articles_path = "tests/files/example_doaj_articles.json"

    def run_import(do_updates):
        # Reset and configure the importer, push the file through it, and
        # print the resulting counts before handing them back.
        with open(articles_path, "r") as articles:
            doaj_importer.reset()
            doaj_importer.bezerk_mode = False
            doaj_importer.do_updates = do_updates
            doaj_importer.do_fuzzy_match = False
            result = JsonLinePusher(doaj_importer, articles).run()
        print(result)
        return result

    def newest_editgroup():
        newest_entry = doaj_importer.api.get_changelog(limit=1)[0]
        return doaj_importer.api.get_editgroup(newest_entry.editgroup_id)

    # updates disabled: the record with the existing DOI is only counted
    counts = run_import(False)
    for key, expected in (("insert", 4), ("exists", 1), ("skip", 0)):
        assert counts[key] == expected
    success_editgroup = newest_editgroup()

    # updates enabled: that one record is now updated
    counts = run_import(True)
    for key, expected in (("insert", 0), ("exists", 4), ("update", 1)):
        assert counts[key] == expected
    update_editgroup = newest_editgroup()

    # updates enabled again: nothing is left to change
    counts = run_import(True)
    for key, expected in (("insert", 0), ("exists", 5), ("update", 0)):
        assert counts[key] == expected

    # cleanup (so other import tests work): delete the inserted releases and
    # point the updated release back at its previous revision
    cleanup_eg = quick_eg(doaj_importer.api)
    for inserted_edit in success_editgroup.edits.releases:
        doaj_importer.api.delete_release(cleanup_eg.editgroup_id, inserted_edit.ident)
    for updated_edit in update_editgroup.edits.releases:
        print(updated_edit)
        reverted = ReleaseEntity(
            revision=updated_edit.prev_revision,
            ext_ids=ReleaseExtIds(),
        )
        doaj_importer.api.update_release(
            cleanup_eg.editgroup_id,
            updated_edit.ident,
            reverted,
        )
    doaj_importer.api.accept_editgroup(cleanup_eg.editgroup_id)


def test_doaj_dict_parse(doaj_importer):
    """
    Parse the first record of the example DOAJ articles file with
    ``parse_record`` and check the fields of the resulting release.
    """
    with open("tests/files/example_doaj_articles.json", "r") as articles:
        first_line = articles.readline()
    record = json.loads(first_line)
    release = doaj_importer.parse_record(record)

    # bibliographic fields
    expected_title = (
        "Effect of hydrogen on tensile properties and fracture behavior of PH 13-8 Mo steel"
    )
    assert release.title == expected_title
    assert release.publisher == "Elsevier"
    assert release.release_type == "article-journal"
    assert release.release_stage == "published"
    assert release.license_slug == "cc-by-nc-nd"
    assert release.release_year == 2016
    assert release.volume == "108"
    assert release.pages == "608-617"
    assert release.language == "en"

    # fields expected to be unset for this record
    for unset_field in ("original_title", "subtitle", "release_date", "number", "version"):
        assert getattr(release, unset_field) is None

    # external identifiers
    ext_ids = release.ext_ids
    assert ext_ids.doi == "10.1016/j.matdes.2016.06.110"
    assert ext_ids.doaj == "e58f08a11ecb495ead55a44ad4f89808"

    # matched by ISSN, so wouldn't be defined normally
    assert release.extra["container_name"] == "Materials & Design"

    # abstracts, contributors, references
    assert len(release.abstracts) == 1
    assert len(release.abstracts[0].content) == 1033
    assert len(release.contribs) == 5
    first_contrib = release.contribs[0]
    assert first_contrib.raw_name == "Xinfeng Li"
    assert first_contrib.given_name is None
    assert first_contrib.surname is None
    assert not release.refs

    # print(release.extra)
    assert release.extra["release_month"] == 10
    assert release.extra["country"] == "gb"