diff options
Diffstat (limited to 'python/scripts')
| -rwxr-xr-x | python/scripts/arabesque2ingestrequest.py | 4 | ||||
| -rwxr-xr-x | python/scripts/archiveorg_fileset.py | 3 | ||||
| -rwxr-xr-x | python/scripts/cdx_collection.py | 8 | ||||
| -rwxr-xr-x | python/scripts/covid2ingestrequest.py | 5 | ||||
| -rwxr-xr-x | python/scripts/deliver_dumpgrobid_to_s3.py | 8 | ||||
| -rwxr-xr-x | python/scripts/deliver_gwb_to_disk.py | 12 | ||||
| -rwxr-xr-x | python/scripts/deliver_gwb_to_s3.py | 12 | ||||
| -rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 7 | ||||
| -rwxr-xr-x | python/scripts/enrich_scored_matches.py | 5 | ||||
| -rwxr-xr-x | python/scripts/filter_grobid_metadata.py | 2 | ||||
| -rwxr-xr-x | python/scripts/filter_groupworks.py | 2 | ||||
| -rwxr-xr-x | python/scripts/filter_scored_matches.py | 2 | ||||
| -rwxr-xr-x | python/scripts/grobid_affiliations.py | 3 | ||||
| -rwxr-xr-x | python/scripts/import_grobid_metadata.py | 4 | ||||
| -rwxr-xr-x | python/scripts/ingestrequest_row2json.py | 4 | ||||
| -rwxr-xr-x | python/scripts/manifest_converter.py | 2 | ||||
| -rwxr-xr-x | python/scripts/oai2ingestrequest.py | 5 | ||||
| -rwxr-xr-x | python/scripts/pdf_thumbnail.py | 1 | ||||
| -rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 5 | 
19 files changed, 51 insertions, 43 deletions
| diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 03a1f29..69fe320 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -12,9 +12,9 @@ Run like:  Can then run through requests using that tool, or dump into kafka queue.  """ -import sys -import json  import argparse +import json +import sys  def run(args): diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py index 0e507eb..86ca062 100755 --- a/python/scripts/archiveorg_fileset.py +++ b/python/scripts/archiveorg_fileset.py @@ -9,13 +9,12 @@ TODO:  - should this check the item type?  """ -import sys  import json +import sys  from typing import Any  import internetarchive -  FORMAT_TO_MIMETYPE = {      'BZIP': 'application/x-bzip',      'BZIP2': 'application/x-bzip2', diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py index e867b21..5e33def 100755 --- a/python/scripts/cdx_collection.py +++ b/python/scripts/cdx_collection.py @@ -11,12 +11,14 @@ Call with a collection name:  """  import os -import sys  import shutil -import tempfile -import requests  import subprocess +import sys +import tempfile +  import internetarchive as ia +import requests +  def run(): diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 33c425d..1b7c85c 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -4,9 +4,10 @@  Transform an unpaywall dump (JSON) into ingest requests.  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 86b3b35..62a85e6 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -23,12 +23,12 @@ Requires:  - boto3 (AWS S3 client library)  """ -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter  import boto3 diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index 3dcf962..ab1906a 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -7,19 +7,19 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.  # in `wayback` library. Means we can't run pylint.  # pylint: skip-file -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter +from http.client import IncompleteRead  import raven  import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore  from gwb.loader import CDXLoaderFactory +from wayback.resourcestore import ResourceStore  # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable  sentry_client = raven.Client() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index 39ac000..f103205 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -33,20 +33,20 @@ Requires:  # in `wayback` library. Means we can't run pylint.  # pylint: skip-file -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter +from http.client import IncompleteRead  import boto3  import raven  import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore  from gwb.loader import CDXLoaderFactory +from wayback.resourcestore import ResourceStore  # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable  sentry_client = raven.Client() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index a7214d0..15b30a0 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -9,11 +9,12 @@ in the HTML headers and adds an ingest request on that basis. Or even just run  the re-ingest in-process and publish a second result.  """ -import sys -import json  import argparse +import json +import sys +from typing import List, Optional +  import urlcanon -from typing import Optional, List  DOMAIN_BLOCKLIST = [      # large OA publishers (we get via DOI) diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 9fe1499..3085346 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -17,9 +17,10 @@ And outputs JSON objects that are can be imported into fatcat with the  No dependencies (only python3 stdlib)  """ -import sys -import json  import base64 +import json +import sys +  def run():      for line in sys.stdin: diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index dc4bea7..d0666ce 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -1,7 +1,7 @@  #!/usr/bin/env python3 -import sys  import json +import sys  with open('title_slug_denylist.txt', 'r') as f:      TITLE_DENYLIST = [l.strip() for l in f] diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index bbba770..494da71 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:  - dates differ (not just year)  """ -import sys  import json +import sys  # out of 1000  SCORE_THRESHOLD = 900 diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index 3654b87..abf81bd 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).  No dependencies (only python3 stdlib)  """ -import sys  import json +import sys  # out of 1000  score_threshold = 900 diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index 79feac1..d391f60 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -10,11 +10,12 @@ Run in bulk like:      ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'  """ -import sys  import json +import sys  from grobid2json import teixml2json +  def parse_hbase(line):      line = line.split('\t')      assert len(line) == 2 diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index d01b526..8aee0be 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -1,8 +1,8 @@  #!/usr/bin/env python3 -import sys -import json  import datetime +import json +import sys  MAX_ABSTRACT_BYTES=4096 diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 494ec7a..acba2a8 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -7,9 +7,9 @@ format) back in to regular ingest request JSON.  The only difference is the name and location of some optional keys.  """ -import sys -import json  import argparse +import json +import sys  def transform(row): diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py index 35cee5b..8267003 100755 --- a/python/scripts/manifest_converter.py +++ b/python/scripts/manifest_converter.py @@ -10,9 +10,9 @@ This was used to convert this manifest:  to JSON format for fast fatcat importing.  """ -import sys  import json  import sqlite3 +import sys  # iterate over rows in files metadata...  # 1. select all identified DOIs diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 916f41c..315b8d2 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -6,9 +6,10 @@ Transform an OAI-PMH bulk dump (JSON) into ingest requests.  Eg: https://archive.org/details/oai_harvest_20200215  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon  DOMAIN_BLOCKLIST = [ diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index af08db6..71fbe54 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -7,6 +7,7 @@ Originally used to benchmark and compare file size/quality.  """  import sys +  import poppler  from PIL import Image diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 5536e6c..590b429 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -4,9 +4,10 @@  Transform an unpaywall dump (JSON) into ingest requests.  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon  DOMAIN_BLOCKLIST = [ | 
