summaryrefslogtreecommitdiffstats
path: root/python/tests/harvest_pubmed.py
blob: 422870f2220498e3a5627aac6a28e9490793f243 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
Test pubmed FTP harvest.
"""

import datetime
import os

import pytest

from fatcat_tools.harvest import *


def test_pubmed_harvest_date(mocker):
    """
    Harvest one date from a mocked FTP download and check what is published.

    Kafka state initialization, the FTP retrieval, the date-to-file mapping
    and file deletion are all mocked, so the test only reads the local
    sample file. It expects one produced message per 'PubmedArticle' element
    in the sample (176), a single flush and two file removals.
    """

    # mock out the harvest state object so it doesn't try to actually connect
    # to Kafka
    mocker.patch("fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka")

    # Mocking a file fetched from FTP, should contain some 'PubmedArticle' elements.
    # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '<PubmedArticle>'
    # 176
    file_to_retrieve = os.path.join(os.path.dirname(__file__), "files/pubmedsample_2019.xml.gz")
    ftpretr = mocker.patch("fatcat_tools.harvest.pubmed.ftpretr")
    ftpretr.return_value = file_to_retrieve

    test_date = "2020-02-20"

    # We'll need one entry in the date_file_map.
    generate_date_file_map = mocker.patch("fatcat_tools.harvest.pubmed.generate_date_file_map")
    generate_date_file_map.return_value = {test_date: {"dummy"}}

    # Stub out file deletion so cleanup does not remove the sample file. Use
    # mocker.patch (not a plain assignment to os.remove) so the real function
    # is restored when the test finishes and the stub cannot leak into other
    # tests.
    remove = mocker.patch("os.remove")

    harvester = PubmedFTPWorker(
        kafka_hosts="dummy",
        produce_topic="dummy-produce-topic",
        state_topic="dummy-state-topic",
    )

    harvester.producer = mocker.Mock()
    harvester.date_file_map = generate_date_file_map()
    # Since we mock out the FTP fetch, the concrete date does not matter here.
    harvester.fetch_date(datetime.datetime.strptime(test_date, "%Y-%m-%d"))

    # check that the expected number of article objects were published to the
    # (mock) kafka topic
    assert harvester.producer.produce.call_count == 176
    assert harvester.producer.flush.call_count == 1
    # presumably the fetched file plus a temporary decompressed copy -- the
    # count of two is taken from the original test; verify against fetch_date
    assert remove.call_count == 2


def test_pubmed_harvest_date_no_pmid(mocker):
    """
    Fetching a date whose sample file lacks a PMID must raise ValueError.

    Kafka state initialization, the FTP retrieval, the date-to-file mapping
    and file deletion are mocked, so only the local sample file is read.
    """
    # mock out the harvest state object so it doesn't try to actually connect
    # to Kafka
    mocker.patch("fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka")

    file_to_retrieve = os.path.join(
        os.path.dirname(__file__), "files/pubmedsample_no_pmid_2019.xml.gz"
    )
    ftpretr = mocker.patch("fatcat_tools.harvest.pubmed.ftpretr")
    ftpretr.return_value = file_to_retrieve

    test_date = "2020-02-20"

    # We'll need one entry in the date_file_map.
    generate_date_file_map = mocker.patch("fatcat_tools.harvest.pubmed.generate_date_file_map")
    generate_date_file_map.return_value = {test_date: {"dummy"}}

    # Stub out file deletion (restored automatically at teardown) so that the
    # sample file cannot be deleted should fetch_date ever run to completion.
    mocker.patch("os.remove")

    harvester = PubmedFTPWorker(
        kafka_hosts="dummy",
        produce_topic="dummy-produce-topic",
        state_topic="dummy-state-topic",
    )

    harvester.producer = mocker.Mock()
    # NOTE(review): assign the mapping explicitly, as test_pubmed_harvest_date
    # does, so the entry built above is actually used and the ValueError below
    # comes from processing the sample file -- confirm the constructor does
    # not already populate date_file_map.
    harvester.date_file_map = generate_date_file_map()

    # The file has no PMID, so it is not importable.
    with pytest.raises(ValueError):
        harvester.fetch_date(datetime.datetime.strptime(test_date, "%Y-%m-%d"))