diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-02-19 02:28:11 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-02-22 17:44:38 +0100 | 
| commit | 376053a479a8d683fc5e099d0b0b3cb76c026d16 (patch) | |
| tree | 0ef3343b258ad9d485aa8a558e1f505832c5d415 /python/tests/harvest_pubmed.py | |
| parent | 456f318b5ef904786aabf2411d2d244cd38f25b1 (diff) | |
| download | fatcat-376053a479a8d683fc5e099d0b0b3cb76c026d16.tar.gz fatcat-376053a479a8d683fc5e099d0b0b3cb76c026d16.zip | |
more pubmed adjustments
* regenerate map in continuous mode
* add tests
Diffstat (limited to 'python/tests/harvest_pubmed.py')
| -rw-r--r-- | python/tests/harvest_pubmed.py | 78 | 
1 files changed, 78 insertions, 0 deletions
| diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py new file mode 100644 index 00000000..71832722 --- /dev/null +++ b/python/tests/harvest_pubmed.py @@ -0,0 +1,78 @@ +""" +Test pubmed FTP harvest. +""" + +import datetime +import json +import os + +import pytest + +from fatcat_tools.harvest import * + + +def test_pubmed_harvest_date(mocker): + +    # mock out the harvest state object so it doesn't try to actually connect +    # to Kafka +    mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + +    # Mocking a file fetched from FTP, should contain some 'PubmedArticle' elements. +    # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '<PubmedArticle>' +    # 176 +    file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_2019.xml.gz') +    ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr') +    ftpretr.return_value = file_to_retrieve + +    test_date = '2020-02-20' + +    # We'll need one entry in the date_file_map. +    generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map') +    generate_date_file_map.return_value = {test_date: set(['dummy'])} + +    # For cleanup. +    os.remove = mocker.Mock() + +    harvester = PubmedFTPWorker( +        kafka_hosts="dummy", +        produce_topic="dummy-produce-topic", +        state_topic="dummy-state-topic", +    ) + +    harvester.producer = mocker.Mock() +    # Since we mock out the FTP fetch, the concrete date does not matter here. +    harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) + +    # check that we published the expected number of DOI objects were published +    # to the (mock) kafka topic +    assert harvester.producer.produce.call_count == 176 +    assert harvester.producer.flush.call_count == 1 +    assert os.remove.call_count == 2 + +def test_pubmed_harvest_date_no_pmid(mocker): +    # mock out the harvest state object so it doesn't try to actually connect +    # to Kafka +    mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + +    file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_no_pmid_2019.xml.gz') +    ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr') +    ftpretr.return_value = file_to_retrieve + +    test_date = '2020-02-20' + +    # We'll need one entry in the date_file_map. +    generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map') +    generate_date_file_map.return_value = {test_date: set(['dummy'])} + +    harvester = PubmedFTPWorker( +        kafka_hosts="dummy", +        produce_topic="dummy-produce-topic", +        state_topic="dummy-state-topic", +    ) + +    harvester.producer = mocker.Mock() + +    # The file has not PMID, not importable. +    with pytest.raises(ValueError): +        harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) + | 
