diff options
author | Martin Czygan <martin@archive.org> | 2020-07-06 18:53:24 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2020-07-06 18:53:24 +0000 |
commit | 68cf95bd1d1588c0d3170b4032596756e07ae718 (patch) | |
tree | 3163a803dd6743c84c83a786b5aea7eda3bbca8e /python/fatcat_tools/harvest | |
parent | bea909f997bcef51e2624b9eea42c8fbe7115aaa (diff) | |
parent | 8583c6866f2bb89e8dfe5f5e5893048c2fd854e7 (diff) | |
download | fatcat-68cf95bd1d1588c0d3170b4032596756e07ae718.tar.gz fatcat-68cf95bd1d1588c0d3170b4032596756e07ae718.zip |
Merge branch 'bnewbold-lint' into 'master'
lint cleanups
See merge request webgroup/fatcat!62
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 7 | ||||
-rw-r--r-- | python/fatcat_tools/harvest/harvest_common.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/harvest/oaipmh.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 2 |
4 files changed, 6 insertions, 19 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 37628f09..2554fe96 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -1,16 +1,10 @@ -import re import sys -import csv import json import time -import itertools -import datetime -import requests from confluent_kafka import Producer, KafkaException from urllib.parse import urlparse, parse_qs -from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState, requests_retry_session @@ -64,7 +58,6 @@ class HarvestCrossrefWorker: to be careful how state is serialized back into kafka. """ - def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, api_host_url="https://api.crossref.org/works", start_date=None, end_date=None): diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 27ab8b4a..bdae3054 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -1,15 +1,13 @@ import sys import json -import time import datetime import requests from requests.adapters import HTTPAdapter # unclear why pylint chokes on this import. Recent 'requests' and 'urllib3' are # in Pipenv.lock, and there are no errors in QA from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error -from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException, \ - OFFSET_BEGINNING +from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException # Used for parsing ISO date format (YYYY-MM-DD) @@ -130,9 +128,11 @@ class HarvestState: }).encode('utf-8') if kafka_topic: assert(kafka_config) + def fail_fast(err, msg): if err: raise KafkaException(err) + print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr) producer_conf = kafka_config.copy() producer_conf.update({ @@ -159,9 +159,11 @@ class HarvestState: return print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr) + def fail_fast(err, msg): if err: raise KafkaException(err) + conf = kafka_config.copy() conf.update({ 'group.id': 'dummy_init_group', # should never be committed diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index d30f9507..a7dc3d8c 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,16 +1,9 @@ -import re import sys -import csv -import json import time -import itertools -import datetime -import requests import sickle from confluent_kafka import Producer, KafkaException -from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState @@ -31,7 +24,6 @@ class HarvestOaiPmhWorker: would want something similar operationally. Oh well! """ - def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None): diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index f6301b8d..802d31d8 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -19,7 +19,7 @@ import tempfile import time import xml.etree.ElementTree as ET from ftplib import FTP -from urllib.parse import urljoin, urlparse +from urllib.parse import urlparse import dateparser from bs4 import BeautifulSoup |