about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- python/fatcat_tools/harvest/doi_registrars.py | 7
-rw-r--r-- python/fatcat_tools/harvest/harvest_common.py | 8
-rw-r--r-- python/fatcat_tools/harvest/oaipmh.py | 10
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py | 2
4 files changed, 7 insertions, 20 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 37628f09..2554fe96 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -1,16 +1,10 @@
-import re
import sys
-import csv
import json
import time
-import itertools
-import datetime
-import requests
from confluent_kafka import Producer, KafkaException
from urllib.parse import urlparse, parse_qs
-from fatcat_tools.workers import most_recent_message
from .harvest_common import HarvestState, requests_retry_session
@@ -64,7 +58,6 @@ class HarvestCrossrefWorker:
to be careful how state is serialized back into kafka.
"""
-
def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
api_host_url="https://api.crossref.org/works", start_date=None,
end_date=None):
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index 27ab8b4a..bdae3054 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -1,15 +1,13 @@
import sys
import json
-import time
import datetime
import requests
from requests.adapters import HTTPAdapter
# unclear why pylint chokes on this import. Recent 'requests' and 'urllib3' are
# in Pipenv.lock, and there are no errors in QA
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
-from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException, \
- OFFSET_BEGINNING
+from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException
# Used for parsing ISO date format (YYYY-MM-DD)
@@ -130,9 +128,11 @@ class HarvestState:
}).encode('utf-8')
if kafka_topic:
assert(kafka_config)
+
def fail_fast(err, msg):
if err:
raise KafkaException(err)
+
print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr)
producer_conf = kafka_config.copy()
producer_conf.update({
@@ -159,9 +159,11 @@ class HarvestState:
return
print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr)
+
def fail_fast(err, msg):
if err:
raise KafkaException(err)
+
conf = kafka_config.copy()
conf.update({
'group.id': 'dummy_init_group', # should never be committed
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index d30f9507..c4e4a82a 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -1,16 +1,9 @@
-import re
import sys
-import csv
-import json
import time
-import itertools
-import datetime
-import requests
import sickle
from confluent_kafka import Producer, KafkaException
-from fatcat_tools.workers import most_recent_message
from .harvest_common import HarvestState
@@ -31,7 +24,6 @@ class HarvestOaiPmhWorker:
would want something similar operationally. Oh well!
"""
-
def __init__(self, kafka_hosts, produce_topic, state_topic,
start_date=None, end_date=None):
@@ -69,7 +61,7 @@ class HarvestOaiPmhWorker:
})
producer = Producer(producer_conf)
- api = sickle.Sickle(self.endpoint_url)
+ api = sickle.Sickle(self.endpoint_url, max_retries=5, retry_status_codes=[503])
date_str = date.isoformat()
# this dict kwargs hack is to work around 'from' as a reserved python keyword
# recommended by sickle docs
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index f6301b8d..802d31d8 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -19,7 +19,7 @@ import tempfile
import time
import xml.etree.ElementTree as ET
from ftplib import FTP
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urlparse
import dateparser
from bs4 import BeautifulSoup