about summary refs log tree commit diff stats
path: root/python/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts')
-rwxr-xr-xpython/scripts/arabesque2ingestrequest.py4
-rwxr-xr-xpython/scripts/archiveorg_fileset.py3
-rwxr-xr-xpython/scripts/cdx_collection.py8
-rwxr-xr-xpython/scripts/covid2ingestrequest.py5
-rwxr-xr-xpython/scripts/deliver_dumpgrobid_to_s3.py8
-rwxr-xr-xpython/scripts/deliver_gwb_to_disk.py12
-rwxr-xr-xpython/scripts/deliver_gwb_to_s3.py12
-rwxr-xr-xpython/scripts/doaj2ingestrequest.py7
-rwxr-xr-xpython/scripts/enrich_scored_matches.py5
-rwxr-xr-xpython/scripts/filter_grobid_metadata.py2
-rwxr-xr-xpython/scripts/filter_groupworks.py2
-rwxr-xr-xpython/scripts/filter_scored_matches.py2
-rwxr-xr-xpython/scripts/grobid_affiliations.py3
-rwxr-xr-xpython/scripts/import_grobid_metadata.py4
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py4
-rwxr-xr-xpython/scripts/manifest_converter.py2
-rwxr-xr-xpython/scripts/oai2ingestrequest.py5
-rwxr-xr-xpython/scripts/pdf_thumbnail.py1
-rwxr-xr-xpython/scripts/unpaywall2ingestrequest.py5
19 files changed, 51 insertions, 43 deletions
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 03a1f29..69fe320 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -12,9 +12,9 @@ Run like:
Can then run through requests using that tool, or dump into kafka queue.
"""
-import sys
-import json
import argparse
+import json
+import sys
def run(args):
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 0e507eb..86ca062 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -9,13 +9,12 @@ TODO:
- should this check the item type?
"""
-import sys
import json
+import sys
from typing import Any
import internetarchive
-
FORMAT_TO_MIMETYPE = {
'BZIP': 'application/x-bzip',
'BZIP2': 'application/x-bzip2',
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index e867b21..5e33def 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -11,12 +11,14 @@ Call with a collection name:
"""
import os
-import sys
import shutil
-import tempfile
-import requests
import subprocess
+import sys
+import tempfile
+
import internetarchive as ia
+import requests
+
def run():
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 33c425d..1b7c85c 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -4,9 +4,10 @@
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..62a85e6 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -23,12 +23,12 @@ Requires:
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index 3dcf962..ab1906a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -7,19 +7,19 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import raven
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
+from wayback.resourcestore import ResourceStore
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 39ac000..f103205 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -33,20 +33,20 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import boto3
import raven
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
+from wayback.resourcestore import ResourceStore
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index a7214d0..15b30a0 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -9,11 +9,12 @@ in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""
-import sys
-import json
import argparse
+import json
+import sys
+from typing import List, Optional
+
import urlcanon
-from typing import Optional, List
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 9fe1499..3085346 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -17,9 +17,10 @@ And outputs JSON objects that are can be imported into fatcat with the
No dependencies (only python3 stdlib)
"""
-import sys
-import json
import base64
+import json
+import sys
+
def run():
for line in sys.stdin:
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index dc4bea7..d0666ce 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
with open('title_slug_denylist.txt', 'r') as f:
TITLE_DENYLIST = [l.strip() for l in f]
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..494da71 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3654b87..abf81bd 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
-import sys
import json
+import sys
# out of 1000
score_threshold = 900
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..d391f60 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -10,11 +10,12 @@ Run in bulk like:
ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
-import sys
import json
+import sys
from grobid2json import teixml2json
+
def parse_hbase(line):
line = line.split('\t')
assert len(line) == 2
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index d01b526..8aee0be 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
-import sys
-import json
import datetime
+import json
+import sys
MAX_ABSTRACT_BYTES=4096
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 494ec7a..acba2a8 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -7,9 +7,9 @@ format) back in to regular ingest request JSON.
The only difference is the name and location of some optional keys.
"""
-import sys
-import json
import argparse
+import json
+import sys
def transform(row):
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..8267003 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
to JSON format for fast fatcat importing.
"""
-import sys
import json
import sqlite3
+import sys
# iterate over rows in files metadata...
# 1. select all identified DOIs
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..315b8d2 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -6,9 +6,10 @@ Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index af08db6..71fbe54 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -7,6 +7,7 @@ Originally used to benchmark and compare file size/quality.
"""
import sys
+
import poppler
from PIL import Image
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 5536e6c..590b429 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -4,9 +4,10 @@
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [