diff options
Diffstat (limited to 'python/fatcat_tools')
32 files changed, 116 insertions, 71 deletions
diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py index 13310120..bbf059c0 100644 --- a/python/fatcat_tools/api_auth.py +++ b/python/fatcat_tools/api_auth.py @@ -1,6 +1,7 @@ import os import sys + import fatcat_openapi_client diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py index 04e6ade4..d0fcc761 100644 --- a/python/fatcat_tools/cleanups/common.py +++ b/python/fatcat_tools/cleanups/common.py @@ -1,10 +1,11 @@ -import json import copy +import json import subprocess from collections import Counter from fatcat_openapi_client import ApiClient, Editgroup + from fatcat_tools.transforms import entity_from_dict, entity_to_dict diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py index 10dd45cc..0d275ba6 100644 --- a/python/fatcat_tools/cleanups/files.py +++ b/python/fatcat_tools/cleanups/files.py @@ -1,6 +1,6 @@ -from fatcat_openapi_client.rest import ApiException from fatcat_openapi_client.models import FileEntity +from fatcat_openapi_client.rest import ApiException from .common import EntityCleaner diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py index 4194ea63..0987d10d 100644 --- a/python/fatcat_tools/fcid.py +++ b/python/fatcat_tools/fcid.py @@ -2,6 +2,7 @@ import base64 import uuid + def fcid2uuid(s): """ Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 553f4e7a..d441d495 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -1,9 +1,10 @@ -import sys import json +import sys import time -from confluent_kafka import Producer, KafkaException -from urllib.parse import urlparse, parse_qs +from urllib.parse import parse_qs, urlparse + +from confluent_kafka import KafkaException, Producer from .harvest_common import HarvestState, requests_retry_session diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 5e7702d9..45c2b8ea 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -1,14 +1,15 @@ -import sys -import json import datetime +import json +import sys + import requests +from confluent_kafka import Consumer, KafkaException, Producer, TopicPartition from requests.adapters import HTTPAdapter + # unclear why pylint chokes on this import. Recent 'requests' and 'urllib3' are # in Pipenv.lock, and there are no errors in QA -from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error -from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException - +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error # Used for parsing ISO date format (YYYY-MM-DD) DATE_FMT = "%Y-%m-%d" diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index c4e4a82a..0eb0343d 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,8 +1,9 @@ import sys import time + import sickle -from confluent_kafka import Producer, KafkaException +from confluent_kafka import KafkaException, Producer from .harvest_common import HarvestState diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index c8f7c77c..2b0ff7ec 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,7 +1,7 @@ import fatcat_openapi_client -from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 43325ebc..fc429fb0 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -1,16 +1,16 @@ +import datetime +import json import re import sys -import json -import datetime + +import fatcat_openapi_client from bs4 import BeautifulSoup from pylatexenc.latex2text import LatexNodes2Text -import fatcat_openapi_client from .common import EntityImporter from .crossref import lookup_license_slug - latex2text = LatexNodes2Text() def latex_to_text(raw): diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 5c9efe94..0b634e73 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,5 +1,6 @@ import fatcat_openapi_client + from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 9d22ce83..e33a2012 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -1,32 +1,32 @@ -import re -import sys import csv +import datetime import json +import re import sqlite3 -import datetime import subprocess -from collections import Counter -from typing import Dict, Any, List, Optional, Tuple -import lxml +import sys import xml.etree.ElementTree as ET +from collections import Counter +from typing import Any, Dict, List, Optional, Tuple import elasticsearch +import fatcat_openapi_client +import fuzzycat.common +import fuzzycat.verify +import lxml from bs4 import BeautifulSoup from confluent_kafka import Consumer, KafkaException - -import fatcat_openapi_client from fatcat_openapi_client import ReleaseEntity from fatcat_openapi_client.rest import ApiException from fuzzycat.matching import match_release_fuzzy -import fuzzycat.common -import fuzzycat.verify # TODO: refactor so remove need for this (re-imports for backwards compatibility) -from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 +from fatcat_tools.normal import LANG_MAP_MARC, b32_hex +from fatcat_tools.normal import clean_str as clean # noqa: F401 +from fatcat_tools.normal import is_cjk from fatcat_tools.transforms import entity_to_dict - DATE_FMT: str = "%Y-%m-%d" SANE_MAX_RELEASES: int = 200 SANE_MAX_URLS: int = 100 diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 38c19a63..fd6936a4 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,14 +1,13 @@ -import sqlite3 import datetime -from typing import Dict, Optional, Any +import sqlite3 +from typing import Any, Dict, Optional import fatcat_openapi_client from fatcat_openapi_client import ReleaseEntity from .common import EntityImporter, clean - # The docs/guide should be the canonical home for these mappings; update there # first # Can get a list of Crossref types (with counts) via API: diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 1593e6f8..a06c68a4 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -10,8 +10,8 @@ functions (parse_datacite_...), which may help testing. import collections import datetime -import re import json +import re import sqlite3 import sys diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index a9f993a8..3d280fb7 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -7,8 +7,9 @@ pre-scraped in to JSON from HTML pages. import sys # noqa: F401 import fatcat_openapi_client -from fatcat_tools.normal import clean_str + from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index fa5cb842..6d028f2f 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -21,18 +21,25 @@ brittle/unreliable TSV lookup mechanism for prefix-to-container_id (as of December 2020). """ -import sys # noqa: F401 +import datetime import json +import sys # noqa: F401 import warnings -import datetime -from typing import List, Optional, Any +from typing import Any, List, Optional import fatcat_openapi_client -from fatcat_tools.normal import (clean_doi, clean_str, parse_month, - clean_orcid, clean_hdl, - clean_arxiv_id, clean_wikidata_qid, clean_isbn13) from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.normal import ( + clean_arxiv_id, + clean_doi, + clean_hdl, + clean_isbn13, + clean_orcid, + clean_str, + clean_wikidata_qid, + parse_month, +) from fatcat_tools.transforms import entity_to_dict diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 833089ae..1831c4cd 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -4,15 +4,24 @@ Importer for DOAJ article-level metadata, schema v1. DOAJ API schema and docs: https://doaj.org/api/v1/docs """ -import warnings import datetime +import warnings from typing import List, Optional import fatcat_openapi_client -from fatcat_tools.normal import (clean_doi, clean_str, parse_month, - clean_orcid, detect_text_lang, parse_lang_name, parse_country_name, - clean_pmid, clean_pmcid) + from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.normal import ( + clean_doi, + clean_orcid, + clean_pmcid, + clean_pmid, + clean_str, + detect_text_lang, + parse_country_name, + parse_lang_name, + parse_month, +) # Cutoff length for abstracts. MAX_ABSTRACT_LENGTH = 2048 diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 3d9f5923..0951ed84 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,5 +1,6 @@ import fatcat_openapi_client + from .common import EntityImporter diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index 13352fb2..43c2a49c 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -2,6 +2,7 @@ import fatcat_openapi_client from fatcat_tools import entity_from_dict + from .common import EntityImporter diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index a811c856..0f666652 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 -import json import base64 +import json + import fatcat_openapi_client + from .common import EntityImporter, clean, make_rel_url MAX_ABSTRACT_BYTES=4096 diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 4d4efc0a..f0943c1e 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -2,6 +2,7 @@ import datetime import fatcat_openapi_client + from .common import EntityImporter, make_rel_url diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 12f5450f..0a983c5e 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,12 +1,14 @@ -import sys -import sqlite3 import datetime -from bs4 import BeautifulSoup +import sqlite3 +import sys import fatcat_openapi_client +from bs4 import BeautifulSoup + from fatcat_tools.normal import clean_doi -from .common import EntityImporter, clean, is_cjk, DATE_FMT + +from .common import DATE_FMT, EntityImporter, clean, is_cjk def parse_jalc_persons(raw_persons): diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 9f3b429f..25d7b3b5 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,5 +1,6 @@ import fatcat_openapi_client + from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 5d35f5e2..d37424d6 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,12 +1,13 @@ -import sys -import json import datetime +import json +import sys import warnings -from bs4 import BeautifulSoup import fatcat_openapi_client -from .common import EntityImporter, clean, LANG_MAP_MARC +from bs4 import BeautifulSoup + +from .common import LANG_MAP_MARC, EntityImporter, clean from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 4412a46d..3bdd23a1 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,8 +1,11 @@ import sys + import fatcat_openapi_client + from .common import EntityImporter, clean + def value_or_none(e): if type(e) == dict: e = e.get('value') diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py index 228de134..32749db2 100644 --- a/python/fatcat_tools/kafka.py +++ b/python/fatcat_tools/kafka.py @@ -1,5 +1,5 @@ -from confluent_kafka import Producer, KafkaException +from confluent_kafka import KafkaException, Producer def kafka_fail_fast(err, msg): diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 24c0bb0a..9b65e768 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -4,10 +4,10 @@ A bunch of helpers to parse and normalize strings: external identifiers, free-form input, titles, etc. """ -import re import base64 -from typing import Optional, Union +import re import unicodedata +from typing import Optional, Union import ftfy import langdetect diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 3a2709a4..8361b260 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -5,18 +5,18 @@ index of reference links between works in the main catalog. See bulk citation and citation API proposals for design documentation. """ -import sys -import datetime import argparse -from typing import Optional, List, Any, Dict, Union +import datetime +import sys +from typing import Any, Dict, List, Optional, Union -from pydantic import BaseModel, validator import elasticsearch from elasticsearch_dsl import Search from fatcat_openapi_client import ReleaseEntity +from pydantic import BaseModel, validator from fatcat_tools import public_api -from fatcat_tools.transforms.access import release_access_options, AccessOption +from fatcat_tools.transforms.access import AccessOption, release_access_options from fatcat_tools.transforms.entities import entity_to_dict diff --git a/python/fatcat_tools/reviewers/review_common.py b/python/fatcat_tools/reviewers/review_common.py index b4930c19..867d826d 100644 --- a/python/fatcat_tools/reviewers/review_common.py +++ b/python/fatcat_tools/reviewers/review_common.py @@ -1,9 +1,9 @@ -import time import datetime import subprocess +import time from collections import Counter -from typing import Optional, List, Any +from typing import Any, List, Optional import fatcat_openapi_client diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index 39d4c6d3..ae9880e7 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -1,9 +1,9 @@ from enum import Enum -from typing import Optional, List +from typing import List, Optional -from pydantic import BaseModel from fatcat_openapi_client import ReleaseEntity +from pydantic import BaseModel class AccessType(str, Enum): diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py index 0556f4fe..f8b26bce 100644 --- a/python/fatcat_tools/transforms/csl.py +++ b/python/fatcat_tools/transforms/csl.py @@ -1,9 +1,13 @@ import json -from citeproc import CitationStylesStyle, CitationStylesBibliography -from citeproc import Citation, CitationItem -from citeproc import formatter +from citeproc import ( + Citation, + CitationItem, + CitationStylesBibliography, + CitationStylesStyle, + formatter, +) from citeproc.source.json import CiteProcJSON from citeproc_styles import get_style_filepath diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index ec5891c3..1826d4eb 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,10 +1,15 @@ import datetime -from typing import Dict, Any, Optional +from typing import Any, Dict, Optional import tldextract - -from fatcat_openapi_client import ReleaseEntity, ContainerEntity, EntityEdit, ChangelogEntry, FileEntity +from fatcat_openapi_client import ( + ChangelogEntry, + ContainerEntity, + EntityEdit, + FileEntity, + ReleaseEntity, +) def check_kbart(year: int, archive: dict) -> Optional[bool]: diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 982ee3ea..a61e364c 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -1,7 +1,8 @@ import json import time -from confluent_kafka import Consumer, Producer, KafkaException + +from confluent_kafka import Consumer, KafkaException, Producer from fatcat_tools.transforms import release_ingest_request, release_to_elasticsearch |