diff options
Diffstat (limited to 'python/fatcat_tools')
33 files changed, 46 insertions, 130 deletions
diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py index add03399..13310120 100644 --- a/python/fatcat_tools/api_auth.py +++ b/python/fatcat_tools/api_auth.py @@ -1,7 +1,7 @@ -import os, sys +import os +import sys import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException def public_api(host_uri): diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py index 47607cf1..04e6ade4 100644 --- a/python/fatcat_tools/cleanups/common.py +++ b/python/fatcat_tools/cleanups/common.py @@ -5,7 +5,6 @@ import subprocess from collections import Counter from fatcat_openapi_client import ApiClient, Editgroup -from fatcat_openapi_client.rest import ApiException from fatcat_tools.transforms import entity_from_dict, entity_to_dict diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py index ec7e9064..a40e4a28 100644 --- a/python/fatcat_tools/cleanups/files.py +++ b/python/fatcat_tools/cleanups/files.py @@ -1,7 +1,6 @@ from fatcat_openapi_client.rest import ApiException from fatcat_openapi_client.models import FileEntity -from fatcat_tools.transforms import entity_to_dict, entity_from_json from .common import EntityCleaner @@ -70,4 +69,3 @@ class FileCleaner(EntityCleaner): self.api.update_file(self.get_editgroup_id(), entity.ident, entity) return 1 - diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 37628f09..2554fe96 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -1,16 +1,10 @@ -import re import sys -import csv import json import time -import itertools -import datetime -import requests from confluent_kafka import Producer, KafkaException from urllib.parse import urlparse, parse_qs -from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState, requests_retry_session @@ -64,7 +58,6 @@ class HarvestCrossrefWorker: to be careful how state is serialized back into kafka. """ - def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, api_host_url="https://api.crossref.org/works", start_date=None, end_date=None): diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 27ab8b4a..bdae3054 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -1,15 +1,13 @@ import sys import json -import time import datetime import requests from requests.adapters import HTTPAdapter # unclear why pylint chokes on this import. Recent 'requests' and 'urllib3' are # in Pipenv.lock, and there are no errors in QA from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error -from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException, \ - OFFSET_BEGINNING +from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException # Used for parsing ISO date format (YYYY-MM-DD) @@ -130,9 +128,11 @@ class HarvestState: }).encode('utf-8') if kafka_topic: assert(kafka_config) + def fail_fast(err, msg): if err: raise KafkaException(err) + print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr) producer_conf = kafka_config.copy() producer_conf.update({ @@ -159,9 +159,11 @@ class HarvestState: return print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr) + def fail_fast(err, msg): if err: raise KafkaException(err) + conf = kafka_config.copy() conf.update({ 'group.id': 'dummy_init_group', # should never be committed diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index d30f9507..a7dc3d8c 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,16 +1,9 @@ -import re import sys -import csv -import json import time -import itertools -import datetime -import requests import sickle from confluent_kafka import Producer, KafkaException -from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState @@ -31,7 +24,6 @@ class HarvestOaiPmhWorker: would want something similar operationally. Oh well! """ - def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None): diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index f6301b8d..802d31d8 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -19,7 +19,7 @@ import tempfile import time import xml.etree.ElementTree as ET from ftplib import FTP -from urllib.parse import urljoin, urlparse +from urllib.parse import urlparse import dateparser from bs4 import BeautifulSoup diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index c71b33e9..47a8c4da 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,10 +1,6 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' @@ -186,4 +182,3 @@ class ArabesqueMatchImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 719592fc..43325ebc 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -7,7 +7,7 @@ from bs4 import BeautifulSoup from pylatexenc.latex2text import LatexNodes2Text import fatcat_openapi_client -from .common import EntityImporter, clean +from .common import EntityImporter from .crossref import lookup_license_slug @@ -97,7 +97,6 @@ class ArxivRawImporter(EntityImporter): **kwargs) self._test_override = False - def parse_record(self, record): if not record: @@ -188,7 +187,6 @@ class ArxivRawImporter(EntityImporter): if lang == 'en': lang = None - # extra: # withdrawn_date # translation_of @@ -244,7 +242,7 @@ class ArxivRawImporter(EntityImporter): For each version, do a lookup by full arxiv_id, and store work/release id results. - + If a version has a DOI, also do a doi lookup and store that result. If there is an existing release with both matching, set that as the existing work. If they don't match, use the full arxiv_id match and @@ -345,6 +343,7 @@ class ArxivRawImporter(EntityImporter): print(json.dumps(resp)) #sys.exit(-1) -if __name__=='__main__': + +if __name__ == '__main__': parser = ArxivRawImporter(None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 536c013b..36a2f9a6 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -82,7 +82,7 @@ def cdl_dash_release(meta, extra=None): #print(abstracts) if not abstracts: abstracts = None - + contribs = [] for creator in meta['creator']: contribs.append(ReleaseContrib( @@ -120,7 +120,7 @@ def make_release_fileset(dat_path): with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp: meta_dict = json.loads(fp.read()) - + release = cdl_dash_release(meta_dict) ark_id = release.extra['ark_id'] diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 375b6051..d5d1cce8 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,7 +1,4 @@ -import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index eafc6546..c0578224 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -161,18 +161,18 @@ def is_cjk(s): return False def test_is_cjk(): - assert is_cjk(None) == False - assert is_cjk('') == False - assert is_cjk('blah') == False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True - assert is_cjk('菊') == True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True - assert is_cjk('水道') == True - assert is_cjk('オウ, イク') == True # kanji - assert is_cjk('ひヒ') == True - assert is_cjk('き゚ゅ') == True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True + assert is_cjk(None) is False + assert is_cjk('') is False + assert is_cjk('blah') is False + assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True + assert is_cjk('菊') is True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True + assert is_cjk('水道') is True + assert is_cjk('オウ, イク') is True # kanji + assert is_cjk('ひヒ') is True + assert is_cjk('き゚ゅ') is True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True DOMAIN_REL_MAP = { "archive.org": "archive", @@ -368,7 +368,7 @@ class EntityImporter: if self._entity_queue: self.insert_batch(self._entity_queue) self.counts['insert'] += len(self._entity_queue) - self._entity_queue = [] + self._entity_queue = [] return self.counts diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index d26f089f..854e3d9f 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,10 +1,6 @@ -import sys -import json import sqlite3 import datetime -import itertools -import subprocess import fatcat_openapi_client from .common import EntityImporter, clean @@ -425,7 +421,6 @@ class CrossrefImporter(EntityImporter): release_year = raw_date[0] release_date = None - original_title = None if obj.get('original-title'): original_title = clean(obj.get('original-title')[0], force_xml=True) @@ -500,7 +495,7 @@ class CrossrefImporter(EntityImporter): if existing: self.counts['exists'] += 1 return False - + return True def insert_batch(self, batch): @@ -509,4 +504,3 @@ class CrossrefImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 434a2941..08c85b30 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -10,7 +10,6 @@ functions (parse_datacite_...), which may help testing. import collections import datetime -import hashlib import re import json import sqlite3 @@ -292,7 +291,6 @@ class DataciteImporter(EntityImporter): print('[{}] skipping non-ascii doi for now'.format(doi)) return None - creators = attributes.get('creators', []) or [] contributors = attributes.get('contributors', []) or [] # Much fewer than creators. diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 2077eae4..5ec6cc3c 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 -import sys import json import base64 -import datetime import fatcat_openapi_client from .common import EntityImporter, clean, make_rel_url diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 2b630e67..4b1d3702 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,10 +1,6 @@ -import sys -import json -import base64 -import itertools import fatcat_openapi_client -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): @@ -284,4 +280,3 @@ class SavePaperNowFileImporter(IngestFileResultImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index e30bb233..38aa00eb 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,10 +1,7 @@ import sys -import json import sqlite3 import datetime -import itertools -import subprocess from bs4 import BeautifulSoup import fatcat_openapi_client diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index d439c80a..32782eac 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,7 +1,4 @@ -import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 96dbf947..5d35f5e2 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -183,7 +183,7 @@ class JstorImporter(EntityImporter): # suspect jan 1st dates get set by JSTOR when actual # date not known (citation needed), so drop them release_date = None - + volume = None if article_meta.volume: volume = article_meta.volume.string or None diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 180d7ba3..d95c5847 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,12 +1,8 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client from fatcat_tools.normal import * -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS class MatchedImporter(EntityImporter): @@ -160,7 +156,6 @@ class MatchedImporter(EntityImporter): self.counts['skip-update-inflight'] += 1 return False - # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] @@ -207,4 +202,3 @@ class MatchedImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 554e052f..21feea9e 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,7 +1,5 @@ import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean @@ -89,7 +87,7 @@ class OrcidImporter(EntityImporter): if existing: self.counts['exists'] += 1 return False - + return True def insert_batch(self, batch): diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3d3e3a8c..d8a6842c 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,11 +1,9 @@ import sys import json -import sqlite3 import datetime import warnings from bs4 import BeautifulSoup -from bs4.element import NavigableString import fatcat_openapi_client from fatcat_tools.normal import * @@ -314,7 +312,7 @@ class PubmedImporter(EntityImporter): Importer for PubMed/MEDLINE XML metadata. If lookup_refs is true, will do identifer-based lookups for all references. - + TODO: MEDLINE doesn't include PMC/OA license; could include in importer? """ @@ -502,7 +500,7 @@ class PubmedImporter(EntityImporter): ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id - + ji = journal.JournalIssue volume = None if ji.find("Volume"): diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 4cd22775..c04e9aa8 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,8 +1,4 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client from fatcat_tools.normal import * @@ -192,4 +188,3 @@ class ShadowLibraryImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py index 53b62a37..228de134 100644 --- a/python/fatcat_tools/kafka.py +++ b/python/fatcat_tools/kafka.py @@ -1,5 +1,5 @@ -from confluent_kafka import Consumer, Producer, KafkaException +from confluent_kafka import Producer, KafkaException def kafka_fail_fast(err, msg): diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index f962ff3c..e65af8d6 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -231,4 +231,3 @@ def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789" assert clean_orcid("01234567-3456-6780") == None assert clean_orcid("0x23-4567-3456-6780") == None - diff --git a/python/fatcat_tools/reviewers/review_common.py b/python/fatcat_tools/reviewers/review_common.py index 994cec56..336a47f6 100644 --- a/python/fatcat_tools/reviewers/review_common.py +++ b/python/fatcat_tools/reviewers/review_common.py @@ -1,12 +1,10 @@ -import json import time import datetime import subprocess from collections import Counter import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException """ checks should return: diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py index 832ad6aa..ba199efb 100644 --- a/python/fatcat_tools/transforms/csl.py +++ b/python/fatcat_tools/transforms/csl.py @@ -1,6 +1,5 @@ import json -import collections from citeproc import CitationStylesStyle, CitationStylesBibliography from citeproc import Citation, CitationItem @@ -8,8 +7,6 @@ from citeproc import formatter from citeproc.source.json import CiteProcJSON from citeproc_styles import get_style_filepath -from fatcat_openapi_client import ApiClient - def contribs_by_role(contribs, role): ret = [c.copy() for c in contribs if c['role'] == role] @@ -214,14 +211,13 @@ def citeproc_csl(csl_json, style, html=False): lines = bib.bibliography()[0] if style == "bibtex": out = "" - for l in lines: - if l.startswith(" @"): + for line in lines: + if line.startswith(" @"): out += "@" - elif l.startswith(" "): - out += "\n " + l + elif line.startswith(" "): + out += "\n " + line else: - out += l + out += line return ''.join(out) else: return ''.join(lines) - diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 1d35141b..8ec9c164 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,7 +1,5 @@ -import collections import tldextract -from fatcat_openapi_client import ApiClient def check_kbart(year, archive): @@ -14,11 +12,11 @@ def check_kbart(year, archive): def test_check_kbart(): - assert check_kbart(1990, dict()) == None - assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) == False - assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) == True - assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False - assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True + assert check_kbart(1990, dict()) is None + assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False + assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) is True + assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) is False + assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True def release_to_elasticsearch(entity, force_bool=True): diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py index ae666413..53455e85 100644 --- a/python/fatcat_tools/transforms/entities.py +++ b/python/fatcat_tools/transforms/entities.py @@ -32,4 +32,3 @@ def entity_from_json(json_str, entity_type, api_client=None): def entity_from_dict(obj, entity_type, api_client=None): json_str = json.dumps(obj) return entity_from_json(json_str, entity_type, api_client=api_client) - diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 22b5154e..2f4e2271 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -61,4 +61,3 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= ingest_request['link_source_id'] = link_source_id return ingest_request - diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 3a49f86e..d5891ad1 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -354,4 +354,3 @@ class EntityUpdatesWorker(FatcatWorker): producer.flush() # TODO: publish updated 'work' entities to a topic consumer.store_offsets(message=msg) - diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py index e58b3da1..61854c31 100644 --- a/python/fatcat_tools/workers/elasticsearch.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -1,6 +1,5 @@ import json -import time import requests from confluent_kafka import Consumer, KafkaException @@ -138,7 +137,6 @@ class ElasticsearchReleaseWorker(FatcatWorker): consumer.store_offsets(message=msg) - class ElasticsearchContainerWorker(ElasticsearchReleaseWorker): def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, diff --git a/python/fatcat_tools/workers/worker_common.py b/python/fatcat_tools/workers/worker_common.py index ef79f528..8c2936be 100644 --- a/python/fatcat_tools/workers/worker_common.py +++ b/python/fatcat_tools/workers/worker_common.py @@ -1,15 +1,6 @@ -import re -import sys -import csv -import json -import itertools -from itertools import islice from confluent_kafka import Consumer, KafkaException, TopicPartition -import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException - def most_recent_message(topic, kafka_config): """ |