From 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Oct 2021 12:22:38 -0700 Subject: python: isort all imports --- python/scripts/arabesque2ingestrequest.py | 4 ++-- python/scripts/archiveorg_fileset.py | 3 +-- python/scripts/cdx_collection.py | 8 +++++--- python/scripts/covid2ingestrequest.py | 5 +++-- python/scripts/deliver_dumpgrobid_to_s3.py | 8 ++++---- python/scripts/deliver_gwb_to_disk.py | 12 ++++++------ python/scripts/deliver_gwb_to_s3.py | 12 ++++++------ python/scripts/doaj2ingestrequest.py | 7 ++++--- python/scripts/enrich_scored_matches.py | 5 +++-- python/scripts/filter_grobid_metadata.py | 2 +- python/scripts/filter_groupworks.py | 2 +- python/scripts/filter_scored_matches.py | 2 +- python/scripts/grobid_affiliations.py | 3 ++- python/scripts/import_grobid_metadata.py | 4 ++-- python/scripts/ingestrequest_row2json.py | 4 ++-- python/scripts/manifest_converter.py | 2 +- python/scripts/oai2ingestrequest.py | 5 +++-- python/scripts/pdf_thumbnail.py | 1 + python/scripts/unpaywall2ingestrequest.py | 5 +++-- 19 files changed, 51 insertions(+), 43 deletions(-) (limited to 'python/scripts') diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 03a1f29..69fe320 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -12,9 +12,9 @@ Run like: Can then run through requests using that tool, or dump into kafka queue. """ -import sys -import json import argparse +import json +import sys def run(args): diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py index 0e507eb..86ca062 100755 --- a/python/scripts/archiveorg_fileset.py +++ b/python/scripts/archiveorg_fileset.py @@ -9,13 +9,12 @@ TODO: - should this check the item type? """ -import sys import json +import sys from typing import Any import internetarchive - FORMAT_TO_MIMETYPE = { 'BZIP': 'application/x-bzip', 'BZIP2': 'application/x-bzip2', diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py index e867b21..5e33def 100755 --- a/python/scripts/cdx_collection.py +++ b/python/scripts/cdx_collection.py @@ -11,12 +11,14 @@ Call with a collection name: """ import os -import sys import shutil -import tempfile -import requests import subprocess +import sys +import tempfile + import internetarchive as ia +import requests + def run(): diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 33c425d..1b7c85c 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -4,9 +4,10 @@ Transform an unpaywall dump (JSON) into ingest requests. """ -import sys -import json import argparse +import json +import sys + import urlcanon diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 86b3b35..62a85e6 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -23,12 +23,12 @@ Requires: - boto3 (AWS S3 client library) """ -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter import boto3 diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index 3dcf962..ab1906a 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -7,19 +7,19 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk. # in `wayback` library. Means we can't run pylint. # pylint: skip-file -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter +from http.client import IncompleteRead import raven import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory +from wayback.resourcestore import ResourceStore # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable sentry_client = raven.Client() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index 39ac000..f103205 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -33,20 +33,20 @@ Requires: # in `wayback` library. Means we can't run pylint. # pylint: skip-file -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter +from http.client import IncompleteRead import boto3 import raven import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory +from wayback.resourcestore import ResourceStore # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable sentry_client = raven.Client() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index a7214d0..15b30a0 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -9,11 +9,12 @@ in the HTML headers and adds an ingest request on that basis. Or even just run the re-ingest in-process and publish a second result. """ -import sys -import json import argparse +import json +import sys +from typing import List, Optional + import urlcanon -from typing import Optional, List DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 9fe1499..3085346 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -17,9 +17,10 @@ And outputs JSON objects that are can be imported into fatcat with the No dependencies (only python3 stdlib) """ -import sys -import json import base64 +import json +import sys + def run(): for line in sys.stdin: diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index dc4bea7..d0666ce 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -import sys import json +import sys with open('title_slug_denylist.txt', 'r') as f: TITLE_DENYLIST = [l.strip() for l in f] diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index bbba770..494da71 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out: - dates differ (not just year) """ -import sys import json +import sys # out of 1000 SCORE_THRESHOLD = 900 diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index 3654b87..abf81bd 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file). No dependencies (only python3 stdlib) """ -import sys import json +import sys # out of 1000 score_threshold = 900 diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index 79feac1..d391f60 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -10,11 +10,12 @@ Run in bulk like: ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations' """ -import sys import json +import sys from grobid2json import teixml2json + def parse_hbase(line): line = line.split('\t') assert len(line) == 2 diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index d01b526..8aee0be 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -import sys -import json import datetime +import json +import sys MAX_ABSTRACT_BYTES=4096 diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 494ec7a..acba2a8 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -7,9 +7,9 @@ format) back in to regular ingest request JSON. The only difference is the name and location of some optional keys. """ -import sys -import json import argparse +import json +import sys def transform(row): diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py index 35cee5b..8267003 100755 --- a/python/scripts/manifest_converter.py +++ b/python/scripts/manifest_converter.py @@ -10,9 +10,9 @@ This was used to convert this manifest: to JSON format for fast fatcat importing. """ -import sys import json import sqlite3 +import sys # iterate over rows in files metadata... # 1. select all identified DOIs diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 916f41c..315b8d2 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -6,9 +6,10 @@ Transform an OAI-PMH bulk dump (JSON) into ingest requests. Eg: https://archive.org/details/oai_harvest_20200215 """ -import sys -import json import argparse +import json +import sys + import urlcanon DOMAIN_BLOCKLIST = [ diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index af08db6..71fbe54 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -7,6 +7,7 @@ Originally used to benchmark and compare file size/quality. """ import sys + import poppler from PIL import Image diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 5536e6c..590b429 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -4,9 +4,10 @@ Transform an unpaywall dump (JSON) into ingest requests. """ -import sys -import json import argparse +import json +import sys + import urlcanon DOMAIN_BLOCKLIST = [ -- cgit v1.2.3