diff options
103 files changed, 319 insertions, 341 deletions
diff --git a/extra/bulk_download/README.md b/extra/bulk_download/README.md new file mode 100644 index 00000000..83b92fd9 --- /dev/null +++ b/extra/bulk_download/README.md @@ -0,0 +1,40 @@ + +## Download Fatcat Fulltext from web.archive.org in Bulk + +These quick-and-dirty directions use UNIX utilities to download from the +Internet Archive (either in the wayback machine or archive.org). To make a +proper mirror (eg, for research or preservation use), you would want to verify +hashes (fixity), handle additional retries, and handle files which are not +preserved in Internet Archive, retain linkage between files and fatcat +identifiers, etc. + +You can download a file entity dump from the most recent "Bulk Metadata Export" +item from the [snapshots and exports collection](https://archive.org/details/fatcat_snapshots_and_exports?sort=-publicdate). + +Create a TSV file containing the SHA1 and a single URL for each file +entity: + + zcat file_export.json.gz \ + | grep '"application/pdf"' \ + | jq -cr '.sha1 as $sha1 | .urls | map(select((.url | startswith("https://web.archive.org/web/")) or (.url | startswith("https://archive.org/download/")))) | select(. != []) | [$sha1, .[0].url] | @tsv' \ + > fatcat_files_sha1_iaurl.tsv + +Then use the GNU `parallel` command to call `curl` in parallel to fetch files. +The `-j` argument controls parallelism. Please don't create excessive load on +Internet Archive infrastructure by downloading with too many threads. 10 +parallel threads is a decent amount of load. + + cat fatcat_files_sha1_iaurl.tsv \ + | awk '{print "curl -Lfs --write-out \"%{http_code}\\t" $1 "\\t%{url_effective}\\n\" \"" $2 "\" -o ", $1 ".pdf"}' \ + | parallel --bar -j4 {} \ + > fetch_status.log + +This will write out a status log containing the HTTP status code, expected file +SHA1, and attempted URL. 
You can check for errors (and potentially retry) with: + + grep -v "^200" fetch_status.log + +Or, count status codes: + + cut -f1 fetch_status.log | sort | uniq -c | sort -nr + diff --git a/extra/elasticsearch/sql_queries.md b/extra/elasticsearch/sql_queries.md new file mode 100644 index 00000000..3ea168e5 --- /dev/null +++ b/extra/elasticsearch/sql_queries.md @@ -0,0 +1,8 @@ + +Top missing OA journals by `container_id`: + + POST _xpack/sql?format=txt + { + "query": "SELECT container_id, count(*) from fatcat_release WHERE preservation = 'none' AND is_oa = true GROUP BY container_id ORDER BY count(*) DESC LIMIT 20" + } + diff --git a/notes/bulk_edits/2020-03-23_jalc.md b/notes/bulk_edits/2020-03-23_jalc.md new file mode 100644 index 00000000..d63c3759 --- /dev/null +++ b/notes/bulk_edits/2020-03-23_jalc.md @@ -0,0 +1,23 @@ + +2019-10-01 JaLC metadata snapshot: <https://archive.org/download/jalc-bulk-metadata-2019> + +Extracted .rdf file instead of piping it through zcat. + +Use correct bot: + + export FATCAT_AUTH_WORKER_JALC=blah + +Start small; do a random bunch (10k) single-threaded to pre-create containers: + + head -n100 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + shuf -n100 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + shuf -n10000 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + +Seemed like lots of individual containers getting added after repeating, so +just going to import single-threaded to avoid duplicate container creation: + + cat /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + => Counter({'total': 8419745, 'exists': 6480683, 'insert': 1934082, 'skip': 4980, 'inserted.container': 134, 'update': 0}) + +Had a bit fewer than 
4,568,120 "doi_registrar:jalc" releases before this +import, 6,502,202 after (based on `doi_registrar:jalc` query). diff --git a/notes/cleanup_tasks.txt b/notes/cleanup_tasks.txt new file mode 100644 index 00000000..bf418e59 --- /dev/null +++ b/notes/cleanup_tasks.txt @@ -0,0 +1,18 @@ + +Cambridge Chemical Database (NCI) + + doi_prefix:10.3406 release_type:article + + 193,346+ entities + + should be 'dataset' not 'article' + + datacite importer + +Frontiers + + Frontiers non-PDF abstracts, which have DOIs like `10.3389/conf.*`. Should + crawl these, but `release_type` should be... `abstract`? There are at least + 18,743 of these. Should be fixed in both crossref-bot, then a retro-active + cleanup. + diff --git a/notes/example_entities.txt b/notes/example_entities.txt new file mode 100644 index 00000000..416da610 --- /dev/null +++ b/notes/example_entities.txt @@ -0,0 +1,26 @@ + +errata/update: + Fourth Test of General Relativity: Preliminary Results + 10.1103/physrevlett.20.1265 + 10.1103/physrevlett.21.266.3 + + same title; later is errata to the first. + very minor: The term "baud length" was consistently misprinted as "band length." 
+ +DOIs for individual images + https://commons.wikimedia.org/wiki/Category:Media_from_Williams_et_al._2010_-_10.1371/journal.pone.0010676 + +long-tail journal not in fatcat; web-native, tricky to crawl + https://angryoldmanmagazine.com/ + +dataset + "ISSN-Matching of Gold OA Journals (ISSN-GOLD-OA) 2.0" + https://pub.uni-bielefeld.de/data/2913654 + 2 files + has DOI: 10.4119/unibi/2913654 + +release group; single PDF is valid copy of two DOIs: + https://fatcat.wiki/file/wr64e37yvfcidgbowtslx7omne + 10.5167/uzh-146424 + 10.1016/j.physletb.2017.12.006 + ALSO: has CC-BY license_slug diff --git a/notes/merge_releases_examples.txt b/notes/merge_releases_examples.txt new file mode 100644 index 00000000..ca65705e --- /dev/null +++ b/notes/merge_releases_examples.txt @@ -0,0 +1,21 @@ + +https://fatcat.wiki/release/search?q=Validation+of+middle-atmospheric+campaign-based+water+vapour+measured+by+the+ground-based+microwave+radiometer + + 4 releases, all dois. 3x have same author list, 1 same authors different order + +https://fatcat.wiki/release/search?q=Perspectives+and+pregnancy+outcomes+of+maternal+Ramadan+fasting+in+the+second+trimester+of+pregnancy + + 6 releases: + 2 figshare article + 2 figshare files + 1 primary + 1 correction + +https://figshare.com/articles/Plasmodium_falciparum_evades_innate_immunity_by_hybrid_ABO_blood_group_phenotype_formation/8208689/119 + + 119 versions (!) 
+ +https://fatcat.wiki/release/search?q=NeuroTrends+Visualization + + 45 versions across two figshare works + diff --git a/python/.flake8 b/python/.flake8 new file mode 100644 index 00000000..34f6131c --- /dev/null +++ b/python/.flake8 @@ -0,0 +1,13 @@ +[flake8] +# TODO: ANN for better annotation coverage +select = C,E,F,W +# The ignores starting with "E251" should be removed after using 'black' +ignore = F405,F403,W503,E231,E203,E501,E226,E711,E713,E265,ANN101,ANN204,ANN102,E251,E128,E302,E261,E241,E201,E202,E266,E124,E305,E225,W504,E123,E122,E125,E121,E129,E126,E712,W191,E101 +# TODO: should reduce max-complexity +max-complexity = 50 +exclude = .git,__pycache__,.venv +max-line-length = 120 +per-file-ignores = + */__init__.py: F401 + tests/*.py: F401,F811 + tests/transform_csl.py: W291 diff --git a/python/Makefile b/python/Makefile index 182bc739..4c8ff45f 100644 --- a/python/Makefile +++ b/python/Makefile @@ -6,14 +6,34 @@ SHELL = /bin/bash help: ## Print info about all commands @echo "Commands:" @echo - @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: dep +dep: ## Create local virtualenv using pipenv + pipenv install --dev + +.PHONY: lint +lint: ## Run lints (eg, flake8, mypy) + pipenv run flake8 *.py tests/ fatcat_web/ fatcat_tools/ --select=E9,F63,F7,F82 + pipenv run flake8 *.py tests/ fatcat_web/ fatcat_tools/ --exit-zero + +.PHONY: mypy +mypy: ## Run mypy type checks (not part of regular lint yet) + pipenv run mypy *.py fatcat_web/ fatcat_tools/ --ignore-missing-imports + +# Not ready for 'black' yet +#.PHONY: fmt +#fmt: ## Run code formatting on all source code +# pipenv run black *.py fatcat_web/ fatcat_tools/ tests/ .PHONY: test -test: ## Run all tests and lints - curl --silent localhost:9411/v0/changelog > /dev/null || 
(echo "API not running locally, bailing early from tests" && exit 1) +test: lint ## Run all tests and lints + @curl --silent localhost:9411/v0/changelog > /dev/null || (echo "API not running locally, bailing early from tests" && exit 1) pipenv run pytest - pipenv run pylint -j 0 -E fatcat*.py fatcat_tools fatcat_web tests/*.py - pipenv run flake8 tests/ fatcat_web/ fatcat_tools/ *.py --count --select=E9,F63,F7,F82 --show-source --statistics + +.PHONY: coverage +coverage: ## Run all tests with coverage + pipenv run pytest --cov .PHONY: test-cli test-cli: ## Run CLI commands. WARNING: may mutate local database diff --git a/python/TODO b/python/TODO index fdb72849..52b2b8fe 100644 --- a/python/TODO +++ b/python/TODO @@ -1,4 +1,14 @@ +improve argparse usage + change --host-url to --fatcat-api-url + add 'help=' to all CLI sub-commands; improves --help output + do ArgumentDefaultsHelpFormatter everywhere + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + +Try stubgen for type annotation:: + stubgen -m fatcat_openapi_client -o stubs/ + stubgen -p fatcat_openapi_client -o stubs/ + - schema.org metadata for releases additional tests diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py index d8b2aea2..4e11139e 100755 --- a/python/fatcat_cleanup.py +++ b/python/fatcat_cleanup.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 -import os, sys, argparse +import os +import sys +import argparse import raven from fatcat_tools import authenticated_api diff --git a/python/fatcat_export.py b/python/fatcat_export.py index 5419e46c..763c217e 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -11,11 +11,7 @@ import sys import json import argparse -import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException -from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ChangelogEntry -from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ - public_api +from fatcat_tools import 
uuid2fcid, entity_to_dict, public_api def run_export_releases(args): diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 331cf791..252ab3a5 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 -import os, sys, argparse +import os +import sys +import argparse import raven from fatcat_tools import authenticated_api diff --git a/python/fatcat_review.py b/python/fatcat_review.py index 1d1db9a5..a10fc34b 100755 --- a/python/fatcat_review.py +++ b/python/fatcat_review.py @@ -2,11 +2,10 @@ import sys import argparse -import datetime import raven from fatcat_tools import authenticated_api -from fatcat_tools.reviewers import DummyReviewBot, ReviewBot +from fatcat_tools.reviewers import DummyReviewBot # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable sentry_client = raven.Client() diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py index add03399..13310120 100644 --- a/python/fatcat_tools/api_auth.py +++ b/python/fatcat_tools/api_auth.py @@ -1,7 +1,7 @@ -import os, sys +import os +import sys import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException def public_api(host_uri): diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py index 47607cf1..04e6ade4 100644 --- a/python/fatcat_tools/cleanups/common.py +++ b/python/fatcat_tools/cleanups/common.py @@ -5,7 +5,6 @@ import subprocess from collections import Counter from fatcat_openapi_client import ApiClient, Editgroup -from fatcat_openapi_client.rest import ApiException from fatcat_tools.transforms import entity_from_dict, entity_to_dict diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py index ec7e9064..a40e4a28 100644 --- a/python/fatcat_tools/cleanups/files.py +++ b/python/fatcat_tools/cleanups/files.py @@ -1,7 +1,6 @@ from fatcat_openapi_client.rest import ApiException from fatcat_openapi_client.models import 
FileEntity -from fatcat_tools.transforms import entity_to_dict, entity_from_json from .common import EntityCleaner @@ -70,4 +69,3 @@ class FileCleaner(EntityCleaner): self.api.update_file(self.get_editgroup_id(), entity.ident, entity) return 1 - diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 37628f09..2554fe96 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -1,16 +1,10 @@ -import re import sys -import csv import json import time -import itertools -import datetime -import requests from confluent_kafka import Producer, KafkaException from urllib.parse import urlparse, parse_qs -from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState, requests_retry_session @@ -64,7 +58,6 @@ class HarvestCrossrefWorker: to be careful how state is serialized back into kafka. """ - def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, api_host_url="https://api.crossref.org/works", start_date=None, end_date=None): diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 27ab8b4a..bdae3054 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -1,15 +1,13 @@ import sys import json -import time import datetime import requests from requests.adapters import HTTPAdapter # unclear why pylint chokes on this import. 
Recent 'requests' and 'urllib3' are # in Pipenv.lock, and there are no errors in QA from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error -from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException, \ - OFFSET_BEGINNING +from confluent_kafka import Producer, Consumer, TopicPartition, KafkaException # Used for parsing ISO date format (YYYY-MM-DD) @@ -130,9 +128,11 @@ class HarvestState: }).encode('utf-8') if kafka_topic: assert(kafka_config) + def fail_fast(err, msg): if err: raise KafkaException(err) + print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr) producer_conf = kafka_config.copy() producer_conf.update({ @@ -159,9 +159,11 @@ class HarvestState: return print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr) + def fail_fast(err, msg): if err: raise KafkaException(err) + conf = kafka_config.copy() conf.update({ 'group.id': 'dummy_init_group', # should never be committed diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index d30f9507..c4e4a82a 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,16 +1,9 @@ -import re import sys -import csv -import json import time -import itertools -import datetime -import requests import sickle from confluent_kafka import Producer, KafkaException -from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState @@ -31,7 +24,6 @@ class HarvestOaiPmhWorker: would want something similar operationally. Oh well! 
""" - def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None): @@ -69,7 +61,7 @@ class HarvestOaiPmhWorker: }) producer = Producer(producer_conf) - api = sickle.Sickle(self.endpoint_url) + api = sickle.Sickle(self.endpoint_url, max_retries=5, retry_status_codes=[503]) date_str = date.isoformat() # this dict kwargs hack is to work around 'from' as a reserved python keyword # recommended by sickle docs diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index f6301b8d..802d31d8 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -19,7 +19,7 @@ import tempfile import time import xml.etree.ElementTree as ET from ftplib import FTP -from urllib.parse import urljoin, urlparse +from urllib.parse import urlparse import dateparser from bs4 import BeautifulSoup diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index c71b33e9..47a8c4da 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,10 +1,6 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' @@ -186,4 +182,3 @@ class ArabesqueMatchImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 719592fc..43325ebc 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -7,7 +7,7 @@ from bs4 import BeautifulSoup from pylatexenc.latex2text import LatexNodes2Text import fatcat_openapi_client -from .common 
import EntityImporter, clean +from .common import EntityImporter from .crossref import lookup_license_slug @@ -97,7 +97,6 @@ class ArxivRawImporter(EntityImporter): **kwargs) self._test_override = False - def parse_record(self, record): if not record: @@ -188,7 +187,6 @@ class ArxivRawImporter(EntityImporter): if lang == 'en': lang = None - # extra: # withdrawn_date # translation_of @@ -244,7 +242,7 @@ class ArxivRawImporter(EntityImporter): For each version, do a lookup by full arxiv_id, and store work/release id results. - + If a version has a DOI, also do a doi lookup and store that result. If there is an existing release with both matching, set that as the existing work. If they don't match, use the full arxiv_id match and @@ -345,6 +343,7 @@ class ArxivRawImporter(EntityImporter): print(json.dumps(resp)) #sys.exit(-1) -if __name__=='__main__': + +if __name__ == '__main__': parser = ArxivRawImporter(None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 536c013b..36a2f9a6 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -82,7 +82,7 @@ def cdl_dash_release(meta, extra=None): #print(abstracts) if not abstracts: abstracts = None - + contribs = [] for creator in meta['creator']: contribs.append(ReleaseContrib( @@ -120,7 +120,7 @@ def make_release_fileset(dat_path): with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp: meta_dict = json.loads(fp.read()) - + release = cdl_dash_release(meta_dict) ark_id = release.extra['ark_id'] diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 375b6051..d5d1cce8 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,7 +1,4 @@ -import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean diff --git 
a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index eafc6546..c0578224 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -161,18 +161,18 @@ def is_cjk(s): return False def test_is_cjk(): - assert is_cjk(None) == False - assert is_cjk('') == False - assert is_cjk('blah') == False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True - assert is_cjk('菊') == True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True - assert is_cjk('水道') == True - assert is_cjk('オウ, イク') == True # kanji - assert is_cjk('ひヒ') == True - assert is_cjk('き゚ゅ') == True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True + assert is_cjk(None) is False + assert is_cjk('') is False + assert is_cjk('blah') is False + assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True + assert is_cjk('菊') is True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True + assert is_cjk('水道') is True + assert is_cjk('オウ, イク') is True # kanji + assert is_cjk('ひヒ') is True + assert is_cjk('き゚ゅ') is True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True DOMAIN_REL_MAP = { "archive.org": "archive", @@ -368,7 +368,7 @@ class EntityImporter: if self._entity_queue: self.insert_batch(self._entity_queue) self.counts['insert'] += len(self._entity_queue) - self._entity_queue = [] + self._entity_queue = [] return self.counts diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index d26f089f..854e3d9f 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,10 +1,6 @@ -import sys -import json import sqlite3 import datetime -import itertools -import subprocess import fatcat_openapi_client from .common import EntityImporter, clean @@ -425,7 +421,6 @@ class CrossrefImporter(EntityImporter): release_year = raw_date[0] release_date = None - original_title = None if obj.get('original-title'): 
original_title = clean(obj.get('original-title')[0], force_xml=True) @@ -500,7 +495,7 @@ class CrossrefImporter(EntityImporter): if existing: self.counts['exists'] += 1 return False - + return True def insert_batch(self, batch): @@ -509,4 +504,3 @@ class CrossrefImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 962d80c6..6aeb6a68 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -10,7 +10,6 @@ functions (parse_datacite_...), which may help testing. import collections import datetime -import hashlib import re import json import sqlite3 @@ -765,7 +764,7 @@ class DataciteImporter(EntityImporter): nameType = c.get('nameType', '') or '' if nameType in ('', 'Personal'): creator_id = None - for nid in c.get('nameIdentifiers', []): + for nid in c.get('nameIdentifiers', []) or []: name_scheme = nid.get('nameIdentifierScheme', '') or '' if not name_scheme.lower() == "orcid": continue diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 2077eae4..5ec6cc3c 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 -import sys import json import base64 -import datetime import fatcat_openapi_client from .common import EntityImporter, clean, make_rel_url diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 2b630e67..4b1d3702 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,10 +1,6 @@ -import sys -import json -import base64 -import itertools import fatcat_openapi_client -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import 
EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): @@ -284,4 +280,3 @@ class SavePaperNowFileImporter(IngestFileResultImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index e30bb233..38aa00eb 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,10 +1,7 @@ import sys -import json import sqlite3 import datetime -import itertools -import subprocess from bs4 import BeautifulSoup import fatcat_openapi_client diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index d439c80a..32782eac 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,7 +1,4 @@ -import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 96dbf947..5d35f5e2 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -183,7 +183,7 @@ class JstorImporter(EntityImporter): # suspect jan 1st dates get set by JSTOR when actual # date not known (citation needed), so drop them release_date = None - + volume = None if article_meta.volume: volume = article_meta.volume.string or None diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 180d7ba3..d95c5847 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,12 +1,8 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client from fatcat_tools.normal import * -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS +from .common import EntityImporter, make_rel_url, 
SANE_MAX_RELEASES, SANE_MAX_URLS class MatchedImporter(EntityImporter): @@ -160,7 +156,6 @@ class MatchedImporter(EntityImporter): self.counts['skip-update-inflight'] += 1 return False - # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] @@ -207,4 +202,3 @@ class MatchedImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 554e052f..21feea9e 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,7 +1,5 @@ import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean @@ -89,7 +87,7 @@ class OrcidImporter(EntityImporter): if existing: self.counts['exists'] += 1 return False - + return True def insert_batch(self, batch): diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3d3e3a8c..d8a6842c 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,11 +1,9 @@ import sys import json -import sqlite3 import datetime import warnings from bs4 import BeautifulSoup -from bs4.element import NavigableString import fatcat_openapi_client from fatcat_tools.normal import * @@ -314,7 +312,7 @@ class PubmedImporter(EntityImporter): Importer for PubMed/MEDLINE XML metadata. If lookup_refs is true, will do identifer-based lookups for all references. - + TODO: MEDLINE doesn't include PMC/OA license; could include in importer? 
""" @@ -502,7 +500,7 @@ class PubmedImporter(EntityImporter): ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id - + ji = journal.JournalIssue volume = None if ji.find("Volume"): diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 4cd22775..c04e9aa8 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,8 +1,4 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client from fatcat_tools.normal import * @@ -192,4 +188,3 @@ class ShadowLibraryImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py index 53b62a37..228de134 100644 --- a/python/fatcat_tools/kafka.py +++ b/python/fatcat_tools/kafka.py @@ -1,5 +1,5 @@ -from confluent_kafka import Consumer, Producer, KafkaException +from confluent_kafka import Producer, KafkaException def kafka_fail_fast(err, msg): diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index f962ff3c..e65af8d6 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -231,4 +231,3 @@ def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789" assert clean_orcid("01234567-3456-6780") == None assert clean_orcid("0x23-4567-3456-6780") == None - diff --git a/python/fatcat_tools/reviewers/review_common.py b/python/fatcat_tools/reviewers/review_common.py index ecf7da8f..336a47f6 100644 --- a/python/fatcat_tools/reviewers/review_common.py +++ b/python/fatcat_tools/reviewers/review_common.py @@ -1,12 +1,10 @@ -import json import time import datetime import subprocess from collections import Counter import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException """ checks should return: @@ -132,11 +130,12 @@ class ReviewBot: status, 
result_counts[status]) for result in results: if result.status == status and result.check_type == "editgroup": - comment += "\n- {description}".format(result.description) + comment += "\n- {description}".format(description=result.description) if result.status == status and result.check_type != "editgroup": - comment += "\n- {check_type} [{rev}](/{release_type}/rev/{rev}): {description}".format( + comment += "\n- {check_type} [{rev}](/{entity_type}/rev/{rev}): {description}".format( check_type=result.check_type, rev=result.rev, + entity_type=result.check_type, description=result.description) extra = self.extra.copy() diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py index 832ad6aa..ba199efb 100644 --- a/python/fatcat_tools/transforms/csl.py +++ b/python/fatcat_tools/transforms/csl.py @@ -1,6 +1,5 @@ import json -import collections from citeproc import CitationStylesStyle, CitationStylesBibliography from citeproc import Citation, CitationItem @@ -8,8 +7,6 @@ from citeproc import formatter from citeproc.source.json import CiteProcJSON from citeproc_styles import get_style_filepath -from fatcat_openapi_client import ApiClient - def contribs_by_role(contribs, role): ret = [c.copy() for c in contribs if c['role'] == role] @@ -214,14 +211,13 @@ def citeproc_csl(csl_json, style, html=False): lines = bib.bibliography()[0] if style == "bibtex": out = "" - for l in lines: - if l.startswith(" @"): + for line in lines: + if line.startswith(" @"): out += "@" - elif l.startswith(" "): - out += "\n " + l + elif line.startswith(" "): + out += "\n " + line else: - out += l + out += line return ''.join(out) else: return ''.join(lines) - diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 1d35141b..8ec9c164 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,7 +1,5 @@ -import collections import tldextract -from 
fatcat_openapi_client import ApiClient def check_kbart(year, archive): @@ -14,11 +12,11 @@ def check_kbart(year, archive): def test_check_kbart(): - assert check_kbart(1990, dict()) == None - assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) == False - assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) == True - assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False - assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True + assert check_kbart(1990, dict()) is None + assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False + assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) is True + assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) is False + assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True def release_to_elasticsearch(entity, force_bool=True): diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py index ae666413..53455e85 100644 --- a/python/fatcat_tools/transforms/entities.py +++ b/python/fatcat_tools/transforms/entities.py @@ -32,4 +32,3 @@ def entity_from_json(json_str, entity_type, api_client=None): def entity_from_dict(obj, entity_type, api_client=None): json_str = json.dumps(obj) return entity_from_json(json_str, entity_type, api_client=api_client) - diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 22b5154e..2f4e2271 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -61,4 +61,3 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= ingest_request['link_source_id'] = link_source_id return ingest_request - diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 3a49f86e..d5891ad1 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -354,4 +354,3 @@ 
class EntityUpdatesWorker(FatcatWorker): producer.flush() # TODO: publish updated 'work' entities to a topic consumer.store_offsets(message=msg) - diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py index e58b3da1..61854c31 100644 --- a/python/fatcat_tools/workers/elasticsearch.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -1,6 +1,5 @@ import json -import time import requests from confluent_kafka import Consumer, KafkaException @@ -138,7 +137,6 @@ class ElasticsearchReleaseWorker(FatcatWorker): consumer.store_offsets(message=msg) - class ElasticsearchContainerWorker(ElasticsearchReleaseWorker): def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, diff --git a/python/fatcat_tools/workers/worker_common.py b/python/fatcat_tools/workers/worker_common.py index ef79f528..8c2936be 100644 --- a/python/fatcat_tools/workers/worker_common.py +++ b/python/fatcat_tools/workers/worker_common.py @@ -1,15 +1,6 @@ -import re -import sys -import csv -import json -import itertools -from itertools import islice from confluent_kafka import Consumer, KafkaException, TopicPartition -import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException - def most_recent_message(topic, kafka_config): """ diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py index 14595670..8e01c860 100755 --- a/python/fatcat_transform.py +++ b/python/fatcat_transform.py @@ -9,16 +9,8 @@ import sys import json import argparse -from citeproc import CitationStylesStyle, CitationStylesBibliography -from citeproc import Citation, CitationItem -from citeproc import formatter -from citeproc.source.json import CiteProcJSON -from citeproc_styles import get_style_filepath - -import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException from fatcat_openapi_client import ReleaseEntity, ContainerEntity, FileEntity, ChangelogEntry -from fatcat_tools import uuid2fcid, entity_from_json, 
entity_to_dict, \ +from fatcat_tools import entity_from_json, \ release_to_elasticsearch, container_to_elasticsearch, \ file_to_elasticsearch, changelog_to_elasticsearch, public_api, \ release_to_csl, citeproc_csl diff --git a/python/fatcat_util.py b/python/fatcat_util.py index d6e76697..a45b2ba4 100755 --- a/python/fatcat_util.py +++ b/python/fatcat_util.py @@ -8,14 +8,9 @@ TODO: """ import sys -import json import argparse -import fatcat_openapi_client -from fatcat_openapi_client.rest import ApiException -from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ChangelogEntry -from fatcat_tools import uuid2fcid, fcid2uuid, entity_from_json, \ - entity_to_dict, public_api, authenticated_api +from fatcat_tools import uuid2fcid, fcid2uuid, authenticated_api def run_uuid2fcid(args): diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py index 8e26b7fe..ed9f2252 100644 --- a/python/fatcat_web/auth.py +++ b/python/fatcat_web/auth.py @@ -2,8 +2,7 @@ from collections import namedtuple import requests import pymacaroons -from flask import Flask, render_template, send_from_directory, request, \ - url_for, abort, g, redirect, jsonify, session, flash +from flask import render_template, abort, redirect, session, flash from flask_login import logout_user, login_user, UserMixin from fatcat_web import login_manager, app, api, priv_api, Config import fatcat_openapi_client @@ -141,8 +140,9 @@ def handle_wmoauth(username): # pass off "as if" we did OAuth successfully FakeOAuthRemote = namedtuple('FakeOAuthRemote', ['name', 'OAUTH_CONFIG']) remote = FakeOAuthRemote(name='wikipedia', OAUTH_CONFIG={'api_base_url': "https://www.mediawiki.org/w"}) + conservative_username = ''.join(filter(str.isalnum, username)) oauth_info = { - 'preferred_username': username, + 'preferred_username': conservative_username, 'iss': "https://www.mediawiki.org/w", 'sub': username, } diff --git a/python/fatcat_web/editing_routes.py b/python/fatcat_web/editing_routes.py index 
87223868..44000b1a 100644 --- a/python/fatcat_web/editing_routes.py +++ b/python/fatcat_web/editing_routes.py @@ -1,16 +1,11 @@ -import os -import json -from flask import Flask, render_template, send_from_directory, request, \ - url_for, abort, g, redirect, jsonify, session, flash, Response +from flask import render_template, abort, redirect, session, flash from flask_login import login_required from fatcat_openapi_client import Editgroup from fatcat_openapi_client.rest import ApiException from fatcat_tools.transforms import * -from fatcat_web import app, api, auth_api, priv_api -from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth -from fatcat_web.cors import crossdomain +from fatcat_web import app, api, auth_api from fatcat_web.search import * from fatcat_web.forms import * from fatcat_web.entity_helpers import * @@ -20,7 +15,7 @@ from fatcat_web.entity_helpers import * def form_editgroup_get_or_create(api, edit_form): """ - This function expects a submitted, validated + This function expects a submitted, validated edit form """ if edit_form.editgroup_id.data: try: @@ -43,8 +38,10 @@ def form_editgroup_get_or_create(api, edit_form): app.log.warning(ae) abort(ae.status) # set this session editgroup_id - flash('Started new editgroup <a href="/editgroup/{}">{}</a>' \ - .format(eg.editgroup_id, eg.editgroup_id)) + flash('Started new editgroup <a href="/editgroup/{}">{}</a>'.format( + eg.editgroup_id, + eg.editgroup_id, + )) return eg def generic_entity_edit(editgroup_id, entity_type, existing_ident, edit_template): @@ -68,7 +65,7 @@ def generic_entity_edit(editgroup_id, entity_type, existing_ident, edit_template Helpers: - get_editgroup_revision(editgroup, entity_type, ident) -> None or entity - + TODO: prev_rev interlock """ @@ -214,7 +211,7 @@ def generic_edit_delete(editgroup_id, entity_type, edit_id): # API on behalf of user user_api = auth_api(session['api_token']) - + # do the deletion try: if entity_type == 'container': @@ 
-358,4 +355,3 @@ def work_editgroup_edit(editgroup_id, ident): @app.route('/editgroup/<editgroup_id>/work/edit/<edit_id>/delete', methods=['POST']) def work_edit_delete(editgroup_id, edit_id): return abort(404) - diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py index 377e35aa..15585bf6 100644 --- a/python/fatcat_web/forms.py +++ b/python/fatcat_web/forms.py @@ -8,7 +8,7 @@ from flask_wtf import FlaskForm from wtforms import SelectField, DateField, StringField, IntegerField, \ HiddenField, FormField, FieldList, validators -from fatcat_openapi_client import ContainerEntity, CreatorEntity, FileEntity, \ +from fatcat_openapi_client import ContainerEntity, FileEntity, \ ReleaseEntity, ReleaseContrib, FileUrl, ReleaseExtIds release_type_options = [ @@ -293,9 +293,9 @@ class FileUrlForm(FlaskForm): default='web') class FileEntityForm(EntityEditForm): + # TODO: positive definite size = IntegerField('Size (bytes)', [validators.DataRequired()]) - # TODO: positive definite md5 = StringField("MD5", [validators.Optional(True), validators.Length(min=32, max=32)]) @@ -413,4 +413,3 @@ class SavePaperNowForm(FlaskForm): ingest_request['link_source'] = 'arxiv' ingest_request['link_source_id'] = release.ext_ids.arxiv return ingest_request - diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py index fea7eb5a..56852627 100644 --- a/python/fatcat_web/graphics.py +++ b/python/fatcat_web/graphics.py @@ -33,4 +33,3 @@ def ia_coverage_histogram(rows): chart.add('via Fatcat', [y['available'] for y in years]) chart.add('Missing', [y['missing'] for y in years]) return chart - diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 58f4b7e0..4684f799 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -2,12 +2,12 @@ import os import sys import json -from flask import Flask, render_template, make_response, send_from_directory, \ - request, url_for, abort, g, redirect, jsonify, session, flash, Response +from 
flask import render_template, make_response, send_from_directory, \ + request, url_for, abort, redirect, jsonify, session, flash, Response from flask_login import login_required from flask_wtf.csrf import CSRFError -from fatcat_openapi_client import Editgroup, EditgroupAnnotation +from fatcat_openapi_client import EditgroupAnnotation from fatcat_openapi_client.rest import ApiException from fatcat_tools.transforms import * from fatcat_tools.normal import * @@ -1042,4 +1042,3 @@ def robots(): return send_from_directory(os.path.join(app.root_path, 'static'), 'robots.txt', mimetype='text/plain') - diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index c1246d22..4a87c735 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -66,7 +66,6 @@ def do_release_search(q, limit=30, fulltext_only=True, offset=0): if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: q = 'doi:"{}"'.format(q) - if fulltext_only: q += " in_web:true" @@ -297,7 +296,7 @@ def get_elastic_container_random_releases(ident, limit=5): def get_elastic_container_histogram(ident): """ - Fetches a stacked histogram of + Fetches a stacked histogram Filters to the past 500 years (at most), or about 1000 values. 
diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index 0cb153d6..344f1c2a 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -83,4 +83,3 @@ class Config(object): 'fatcat_domain': FATCAT_DOMAIN, }, } - diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py index 03167a3a..19ac16cd 100755 --- a/python/fatcat_worker.py +++ b/python/fatcat_worker.py @@ -2,7 +2,6 @@ import sys import argparse -import datetime import raven from fatcat_tools import public_api diff --git a/python/shell.py b/python/shell.py index c207a325..d53911b9 100644 --- a/python/shell.py +++ b/python/shell.py @@ -1,3 +1,4 @@ +# flake8: noqa # bunch of libraries one might want import uuid diff --git a/python/tests/api_annotations.py b/python/tests/api_annotations.py index e5566eef..0606b637 100644 --- a/python/tests/api_annotations.py +++ b/python/tests/api_annotations.py @@ -1,10 +1,5 @@ -import json -import pytest -from copy import copy - from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * diff --git a/python/tests/api_containers.py b/python/tests/api_containers.py index 0850fab7..70dbcd7e 100644 --- a/python/tests/api_containers.py +++ b/python/tests/api_containers.py @@ -1,10 +1,5 @@ -import json -import pytest -from copy import copy - from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -43,7 +38,7 @@ def test_container(api): # get redirects (none) assert api.get_container_redirects(c2.ident) == [] - + # delete eg = quick_eg(api) api.delete_container(eg.editgroup_id, c2.ident) @@ -59,4 +54,3 @@ def test_container_examples(api): c2 = api.lookup_container(issnl=c1.issnl) assert c1.ident == c2.ident - diff --git a/python/tests/api_creators.py b/python/tests/api_creators.py index 1ce6380a..b271e2b3 100644 --- a/python/tests/api_creators.py +++ b/python/tests/api_creators.py @@ -1,10 +1,5 @@ -import json 
-import pytest -from copy import copy - from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -52,7 +47,7 @@ def test_creators(api): assert c1.display_name == c3.display_name assert c1.extra == c3.extra - + # delete eg = quick_eg(api) api.delete_creator(eg.editgroup_id, c2.ident) diff --git a/python/tests/api_editgroups.py b/python/tests/api_editgroups.py index d82c9233..142687c2 100644 --- a/python/tests/api_editgroups.py +++ b/python/tests/api_editgroups.py @@ -1,11 +1,8 @@ -import json import pytest import datetime -from copy import copy from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * diff --git a/python/tests/api_editor.py b/python/tests/api_editor.py index 64bb2759..91881743 100644 --- a/python/tests/api_editor.py +++ b/python/tests/api_editor.py @@ -1,12 +1,5 @@ -import json -import pytest -import datetime -from copy import copy - -from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException -from fixtures import * +from fixtures import api def test_editor_update(api): diff --git a/python/tests/api_entity_editing.py b/python/tests/api_entity_editing.py index d5377e18..fee4e34f 100644 --- a/python/tests/api_entity_editing.py +++ b/python/tests/api_entity_editing.py @@ -1,10 +1,7 @@ -import json import pytest -from copy import copy from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -180,4 +177,3 @@ def test_edit_delete_all(api_dummy_entities): assert len(eg.edits.webcaptures) == 0 assert len(eg.edits.releases) == 0 assert len(eg.edits.works) == 0 - diff --git a/python/tests/api_files.py b/python/tests/api_files.py index 74865daa..65eda993 100644 --- a/python/tests/api_files.py +++ b/python/tests/api_files.py @@ -1,10 +1,5 @@ -import json -import pytest -from copy import copy - from fatcat_openapi_client import * -from fatcat_openapi_client.rest 
import ApiException from fixtures import * @@ -50,7 +45,7 @@ def test_file(api): # get redirects (none) assert api.get_file_redirects(f2.ident) == [] - + # delete eg = quick_eg(api) api.delete_file(eg.editgroup_id, f2.ident) diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py index 7f3235cb..6d755744 100644 --- a/python/tests/api_filesets.py +++ b/python/tests/api_filesets.py @@ -1,10 +1,7 @@ -import json import pytest -from copy import copy from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -64,7 +61,7 @@ def test_fileset(api): # get redirects (none) assert api.get_fileset_redirects(fs2.ident) == [] - + # delete eg = quick_eg(api) api.delete_fileset(eg.editgroup_id, fs2.ident) @@ -100,4 +97,3 @@ def test_bad_fileset(api): for b in bad_list: with pytest.raises(fatcat_openapi_client.rest.ApiException): api.create_fileset(eg.editgroup_id, b) - diff --git a/python/tests/api_misc.py b/python/tests/api_misc.py index 11f85fd6..4c9ac9a6 100644 --- a/python/tests/api_misc.py +++ b/python/tests/api_misc.py @@ -1,10 +1,5 @@ -import json -import pytest -from copy import copy - from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -45,4 +40,3 @@ def test_unexpected_body(api): ) f1.urls = [dict(url="http://thing", rel="repository", asdf="blue")] api.create_file(eg.editgroup_id, f1) - diff --git a/python/tests/api_releases.py b/python/tests/api_releases.py index 2df08698..c4c05ea6 100644 --- a/python/tests/api_releases.py +++ b/python/tests/api_releases.py @@ -1,11 +1,8 @@ -import json import pytest import datetime -from copy import copy from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -130,7 +127,7 @@ def test_release(api): # get redirects (none) assert api.get_release_redirects(r2.ident) == [] - + # delete eg = quick_eg(api) api.delete_release(eg.editgroup_id, 
r2.ident) @@ -210,4 +207,3 @@ def test_controlled_vocab(api): api.create_release(eg.editgroup_id, r3) r3.withdrawn_status = "spam" api.create_release(eg.editgroup_id, r3) - diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py index 1054b41f..85813218 100644 --- a/python/tests/api_webcaptures.py +++ b/python/tests/api_webcaptures.py @@ -1,11 +1,8 @@ -import json import pytest import datetime -from copy import copy from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -84,7 +81,7 @@ def test_webcapture(api): # get redirects (none) assert api.get_webcapture_redirects(wc2.ident) == [] - + # delete eg = quick_eg(api) api.delete_webcapture(eg.editgroup_id, wc2.ident) diff --git a/python/tests/citation_efficiency.py b/python/tests/citation_efficiency.py index aefb7d15..f8807db6 100644 --- a/python/tests/citation_efficiency.py +++ b/python/tests/citation_efficiency.py @@ -1,10 +1,5 @@ -import json -import pytest -from copy import copy - from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -110,4 +105,3 @@ def test_citation_encoding(api): assert container == r1.refs[0].container_name assert extra == r1.refs[0].extra assert locator == r1.refs[0].locator - diff --git a/python/tests/clean_files.py b/python/tests/clean_files.py index 8a87f218..ce1102be 100644 --- a/python/tests/clean_files.py +++ b/python/tests/clean_files.py @@ -1,9 +1,10 @@ import copy import pytest + from fatcat_tools.cleanups import FileCleaner from fatcat_openapi_client import * -from fixtures import api +from fixtures import * @pytest.fixture(scope="function") diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py index 78742114..44c7be63 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -1,8 +1,4 @@ -import os -import time -import json -import signal import pytest from dotenv import load_dotenv import fatcat_web @@ -87,4 +83,3 
@@ def test_get_changelog_entry(api): def quick_eg(api_inst): eg = api_inst.create_editgroup(fatcat_openapi_client.Editgroup()) return eg - diff --git a/python/tests/harvest_crossref.py b/python/tests/harvest_crossref.py index e902cda5..cad0f03b 100644 --- a/python/tests/harvest_crossref.py +++ b/python/tests/harvest_crossref.py @@ -1,6 +1,5 @@ import json -import pytest import datetime import responses from fatcat_tools.harvest import * diff --git a/python/tests/harvest_datacite.py b/python/tests/harvest_datacite.py index 004d1fef..13c6042a 100644 --- a/python/tests/harvest_datacite.py +++ b/python/tests/harvest_datacite.py @@ -1,6 +1,5 @@ import json -import pytest import datetime import responses from fatcat_tools.harvest import * diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py index f8db46b6..58bc4226 100644 --- a/python/tests/harvest_pubmed.py +++ b/python/tests/harvest_pubmed.py @@ -2,14 +2,11 @@ Test pubmed FTP harvest. """ -import datetime -import json import os - +import datetime import pytest from fatcat_tools.harvest import * -from fatcat_tools.harvest.pubmed import generate_date_file_map def test_pubmed_harvest_date(mocker): @@ -77,4 +74,3 @@ def test_pubmed_harvest_date_no_pmid(mocker): # The file has not PMID, not importable. 
with pytest.raises(ValueError): harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) - diff --git a/python/tests/harvest_state.py b/python/tests/harvest_state.py index 8b7deba6..cc624d97 100644 --- a/python/tests/harvest_state.py +++ b/python/tests/harvest_state.py @@ -1,6 +1,4 @@ -import json -import pytest import datetime from fatcat_tools.harvest import * diff --git a/python/tests/import_arabesque.py b/python/tests/import_arabesque.py index 9483eb45..20cde3a6 100644 --- a/python/tests/import_arabesque.py +++ b/python/tests/import_arabesque.py @@ -1,8 +1,9 @@ import json import pytest + from fatcat_tools.importers import ArabesqueMatchImporter, SqlitePusher, JsonLinePusher -from fixtures import api +from fixtures import * @pytest.fixture(scope="function") diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py index 1e649616..9306e67c 100644 --- a/python/tests/import_arxiv.py +++ b/python/tests/import_arxiv.py @@ -1,10 +1,10 @@ -import json, gzip import pytest -from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher -from fixtures import api from bs4 import BeautifulSoup +from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher +from fixtures import * + @pytest.fixture(scope="function") def arxiv_importer(api): diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index afa2410f..65cd2c37 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -1,8 +1,10 @@ -import json, gzip +import json +import gzip import pytest + from fatcat_tools.importers import CrossrefImporter, JsonLinePusher -from fixtures import api +from fixtures import * @pytest.fixture(scope="function") diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index b01a11e6..b94b6bc5 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -2,10 +2,13 @@ Test datacite importer. 
""" -import collections +import gzip +import json import datetime +import collections + import pytest -import gzip + from fatcat_tools.importers import DataciteImporter, JsonLinePusher from fatcat_tools.importers.datacite import ( find_original_language_title, diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index 51ab3faa..52284b89 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -3,8 +3,9 @@ import os import json import base64 import pytest + from fatcat_tools.importers import GrobidMetadataImporter, LinePusher -from fixtures import api +from fixtures import * """ WARNING: these tests are currently very fragile because they have database diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 02486de6..ebe2923c 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -1,6 +1,7 @@ import json import pytest + from fatcat_tools.importers import IngestFileResultImporter, JsonLinePusher from fixtures import * diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py index f61ec849..ff757e51 100644 --- a/python/tests/import_jalc.py +++ b/python/tests/import_jalc.py @@ -1,10 +1,10 @@ -import json, gzip import pytest -from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher, Bs4XmlLinesPusher -from fixtures import api from bs4 import BeautifulSoup +from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher, Bs4XmlLinesPusher +from fixtures import * + @pytest.fixture(scope="function") def jalc_importer(api): diff --git a/python/tests/import_journal_metadata.py b/python/tests/import_journal_metadata.py index cfeee517..51b0a78a 100644 --- a/python/tests/import_journal_metadata.py +++ b/python/tests/import_journal_metadata.py @@ -1,7 +1,8 @@ import pytest + from fatcat_tools.importers import JournalMetadataImporter, JsonLinePusher -from fixtures import api +from fixtures import * 
@pytest.fixture(scope="function") diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py index 019f0aae..8494ffb2 100644 --- a/python/tests/import_jstor.py +++ b/python/tests/import_jstor.py @@ -1,10 +1,10 @@ -import json, gzip import pytest -from fatcat_tools.importers import JstorImporter, Bs4XmlFilePusher -from fixtures import api from bs4 import BeautifulSoup +from fatcat_tools.importers import JstorImporter, Bs4XmlFilePusher +from fixtures import * + @pytest.fixture(scope="function") def jstor_importer(api): diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py index 72ed068c..6b61c53c 100644 --- a/python/tests/import_matched.py +++ b/python/tests/import_matched.py @@ -1,8 +1,9 @@ import json import pytest + from fatcat_tools.importers import MatchedImporter, JsonLinePusher -from fixtures import api +from fixtures import * @pytest.fixture(scope="function") diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py index 57886b52..f78ccde7 100644 --- a/python/tests/import_orcid.py +++ b/python/tests/import_orcid.py @@ -1,8 +1,9 @@ import json import pytest + from fatcat_tools.importers import OrcidImporter, JsonLinePusher -from fixtures import api +from fixtures import * @pytest.fixture(scope="function") diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index f57aa273..201f533c 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -1,10 +1,10 @@ -import json, gzip import pytest -from fatcat_tools.importers import PubmedImporter, Bs4XmlLargeFilePusher -from fixtures import api from bs4 import BeautifulSoup +from fatcat_tools.importers import PubmedImporter, Bs4XmlLargeFilePusher +from fixtures import * + @pytest.fixture(scope="function") def pubmed_importer(api): @@ -137,4 +137,3 @@ def test_pubmed_xml_parse_refs(pubmed_importer): r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0]) assert len(r1.refs) > 1 - diff --git 
a/python/tests/import_shadow.py b/python/tests/import_shadow.py index 70a918d2..40a1d589 100644 --- a/python/tests/import_shadow.py +++ b/python/tests/import_shadow.py @@ -1,8 +1,9 @@ import json import pytest + from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher -from fixtures import api +from fixtures import * @pytest.fixture(scope="function") @@ -58,4 +59,3 @@ def test_shadow_dict_parse(shadow_importer): assert u.url.startswith("https://web.archive.org/") assert "20180729135948" in u.url assert len(f.release_ids) == 1 - diff --git a/python/tests/importer.py b/python/tests/importer.py index 9308ba84..a412b247 100644 --- a/python/tests/importer.py +++ b/python/tests/importer.py @@ -1,8 +1,6 @@ - -import pytest from fatcat_tools.importers import CrossrefImporter, OrcidImporter -from fixtures import api +from fixtures import * def test_issnl_mapping_lookup(api): @@ -32,4 +30,3 @@ def test_identifiers(api): assert oi.is_orcid("0000-00x3-3118-659") == False assert oi.is_orcid("0000-00033118-659") == False assert oi.is_orcid("0000-0003-3118-659.") == False - diff --git a/python/tests/subentity_state.py b/python/tests/subentity_state.py index 614f88f1..e03fa99e 100644 --- a/python/tests/subentity_state.py +++ b/python/tests/subentity_state.py @@ -1,10 +1,5 @@ -import json -import pytest -from copy import copy - from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fixtures import * """ @@ -221,4 +216,3 @@ def test_app_entity_states(api, app): assert rv.status_code == 200 rv = app.get('/work/{}'.format(r2.work_id)) assert rv.status_code == 302 - diff --git a/python/tests/tools_api.py b/python/tests/tools_api.py index fd26b8ee..a4b5f2ea 100644 --- a/python/tests/tools_api.py +++ b/python/tests/tools_api.py @@ -1,6 +1,5 @@ import pytest -from fatcat_openapi_client import EditgroupAnnotation from fatcat_openapi_client.rest import ApiException from fatcat_tools import public_api, authenticated_api diff --git 
a/python/tests/transform_csl.py b/python/tests/transform_csl.py index 15c64ce5..6436f876 100644 --- a/python/tests/transform_csl.py +++ b/python/tests/transform_csl.py @@ -1,11 +1,11 @@ import json import pytest + from fatcat_tools import * from fatcat_openapi_client import * - -from fixtures import api from import_crossref import crossref_importer +from fixtures import * def test_csl_crossref(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index a954fc4d..f791562c 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -1,13 +1,13 @@ import json -import pytest + from fatcat_tools import * from fatcat_openapi_client import * -from fixtures import api from import_journal_metadata import journal_metadata_importer - from import_crossref import crossref_importer from import_matched import matched_importer +from fixtures import * + def test_basic_elasticsearch_convert(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: diff --git a/python/tests/transform_ingest.py b/python/tests/transform_ingest.py index 2d5652b8..c7044bc0 100644 --- a/python/tests/transform_ingest.py +++ b/python/tests/transform_ingest.py @@ -1,12 +1,12 @@ import json -import pytest + from fatcat_tools.transforms import release_ingest_request from fatcat_openapi_client import * from fixtures import api - from import_crossref import crossref_importer + def test_basic_ingest_release(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line @@ -54,4 +54,3 @@ def test_rich_ingest_release(): assert ir['base_url'] == 'https://doi.org/10.123/456' assert ir['ext_ids']['doi'] == '10.123/456' assert ir['ext_ids'].get('pmcid') is None - diff --git a/python/tests/web_auth.py b/python/tests/web_auth.py index 2c545b6b..643d806e 100644 --- a/python/tests/web_auth.py +++ 
b/python/tests/web_auth.py @@ -1,8 +1,5 @@ -import json -import pytest import responses -from fatcat_openapi_client.rest import ApiException from fixtures import * diff --git a/python/tests/web_citation_csl.py b/python/tests/web_citation_csl.py index e016b2d9..fb3ce58d 100644 --- a/python/tests/web_citation_csl.py +++ b/python/tests/web_citation_csl.py @@ -1,8 +1,6 @@ import json -import tempfile import pytest -from fatcat_openapi_client.rest import ApiException from fixtures import * diff --git a/python/tests/web_editgroup.py b/python/tests/web_editgroup.py index cbdd2176..20dc8d93 100644 --- a/python/tests/web_editgroup.py +++ b/python/tests/web_editgroup.py @@ -1,9 +1,7 @@ -import json -import pytest -from fatcat_openapi_client.rest import ApiException from fixtures import * + def test_editgroup_basics(app): rv = app.get('/editgroup/aaaaaaaaaaaabo53aaaaaaaaae') @@ -59,4 +57,3 @@ def test_editgroup_annotations_login(app_admin): assert rv.status_code == 200 assert b'Signup' not in rv.data assert b'Add Comment' in rv.data - diff --git a/python/tests/web_editing.py b/python/tests/web_editing.py index 773a59dd..17f4f5ae 100644 --- a/python/tests/web_editing.py +++ b/python/tests/web_editing.py @@ -1,7 +1,4 @@ -import json -import pytest -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -147,4 +144,3 @@ def test_web_edit_get(app_admin): rv = app_admin.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/edit') assert rv.status_code == 200 assert b'1549-1277' in rv.data - diff --git a/python/tests/web_editor.py b/python/tests/web_editor.py index 2614be96..58b21ddf 100644 --- a/python/tests/web_editor.py +++ b/python/tests/web_editor.py @@ -1,7 +1,4 @@ -import json -import pytest -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -25,4 +22,3 @@ def test_change_username(app_admin): assert rv.status_code == 200 rv = app_admin.get('/auth/account') assert b'admin-tmp' not in rv.data - diff --git 
a/python/tests/web_entity_views.py b/python/tests/web_entity_views.py index a3f0f897..c1cbdc29 100644 --- a/python/tests/web_entity_views.py +++ b/python/tests/web_entity_views.py @@ -1,7 +1,4 @@ -import json -import pytest -from fatcat_openapi_client.rest import ApiException from fixtures import * from fatcat_web.forms import ReleaseEntityForm, FileEntityForm, ContainerEntityForm @@ -367,4 +364,3 @@ def test_web_work(app): assert rv.status_code == 404 rv = app.get('/work/create') assert rv.status_code == 404 - diff --git a/python/tests/web_routes.py b/python/tests/web_routes.py index 026776ee..0edf06d1 100644 --- a/python/tests/web_routes.py +++ b/python/tests/web_routes.py @@ -1,7 +1,4 @@ -import json -import pytest -from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -13,4 +10,3 @@ def test_static_routes(app): assert app.get("/search").status_code == 302 assert app.get("/static/bogus/route").status_code == 404 - diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 24b817dc..7647bcf5 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -1,8 +1,7 @@ import json -import pytest import responses -from fatcat_openapi_client.rest import ApiException + from fixtures import * @responses.activate diff --git a/rust/Makefile b/rust/Makefile index 0abd739e..81fb32af 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -6,7 +6,7 @@ SHELL = /bin/bash help: ## Print info about all commands @echo "Commands:" @echo - @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' .PHONY: test test: ## Run all tests and lints |