1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
"""
Test pubmed FTP harvest.
"""
import os
import datetime
import pytest
from fatcat_tools.harvest import *
def test_pubmed_harvest_date(mocker):
    """Run ``PubmedFTPWorker.fetch_date`` against a local sample file.

    The Kafka state initialisation, the FTP retrieval helper and the
    date-to-file map are all replaced by mocks, so no network access takes
    place. The test then checks how often the (mock) producer and the (mock)
    ``os.remove`` were called.
    """
    # mock out the harvest state object so it doesn't try to actually connect
    # to Kafka
    mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')

    # Mocking a file fetched from FTP, should contain some 'PubmedArticle' elements.
    # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '<PubmedArticle>'
    # 176
    file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_2019.xml.gz')
    ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
    ftpretr.return_value = file_to_retrieve

    test_date = '2020-02-20'

    # We'll need one entry in the date_file_map.
    generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
    generate_date_file_map.return_value = {test_date: {'dummy'}}

    # For cleanup. Patch through the mocker instead of assigning to
    # ``os.remove`` directly: a plain assignment is never undone and would
    # leave the mock installed for every test that runs afterwards.
    remove_mock = mocker.patch('os.remove')

    harvester = PubmedFTPWorker(
        kafka_hosts="dummy",
        produce_topic="dummy-produce-topic",
        state_topic="dummy-state-topic",
    )
    harvester.producer = mocker.Mock()
    harvester.date_file_map = generate_date_file_map()

    # Since we mock out the FTP fetch, the concrete date does not matter here.
    harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))

    # check that the expected number of records were published to the
    # (mock) kafka topic
    assert harvester.producer.produce.call_count == 176
    assert harvester.producer.flush.call_count == 1
    assert remove_mock.call_count == 2
def test_pubmed_harvest_date_no_pmid(mocker):
    """Expect ``fetch_date`` to raise ``ValueError`` for a sample without PMID.

    The Kafka state initialisation, the FTP retrieval helper and the
    date-to-file map are replaced by mocks, so no network access takes place.
    """
    # mock out the harvest state object so it doesn't try to actually connect
    # to Kafka
    mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')

    file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_no_pmid_2019.xml.gz')
    ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
    ftpretr.return_value = file_to_retrieve

    test_date = '2020-02-20'

    # We'll need one entry in the date_file_map.
    generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
    generate_date_file_map.return_value = {test_date: {'dummy'}}

    # Keep this test self-contained: stub out file removal here rather than
    # relying on whatever state another test left on ``os.remove``. The mocker
    # restores the real function at teardown.
    # NOTE(review): presumably the worker removes fetched files after
    # processing; this guard keeps the checked-in sample safe if that cleanup
    # is ever reached -- confirm against the worker implementation.
    mocker.patch('os.remove')

    harvester = PubmedFTPWorker(
        kafka_hosts="dummy",
        produce_topic="dummy-produce-topic",
        state_topic="dummy-state-topic",
    )
    harvester.producer = mocker.Mock()

    # The file has no PMID, so it is not importable.
    with pytest.raises(ValueError):
        harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))
|