author     Bryan Newbold <bnewbold@archive.org>   2021-10-26 12:22:38 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2021-10-26 12:22:38 -0700
commit     3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (patch)
tree       b7e7e27ff2032c99fd782b3ea40daf1d12f9164e /python
parent     f67d870ba4ca9cecd0b75f106335997c813e9df4 (diff)
download   sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.tar.gz
           sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.zip
python: isort all imports
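
For reference, the style applied in this commit groups imports into standard-library, third-party, and first-party sections separated by blank lines, keeps plain import lines ahead of from-imports within each section, sorts alphabetically, and wraps long from-imports in parentheses (the wrap width throughout the diff suggests a line length around 120; the repository's actual isort configuration is not shown in this patch). A minimal sketch of an import block in that style, using modules that appear in this diff; the commented first-party line is illustrative only:

    # Standard-library section: plain imports first, then from-imports,
    # each sorted alphabetically.
    import argparse
    import json
    import sys
    from typing import Any, Dict, List, Optional, Tuple

    # Third-party section, separated by a blank line (requires the
    # `requests` package to be installed).
    import requests

    # First-party (sandcrawler) section would follow after another blank line, e.g.:
    # from sandcrawler.misc import clean_url, gen_file_metadata
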
Diffstat (limited to 'python')
-rwxr-xr-x  python/grobid2json.py                        |  4
-rwxr-xr-x  python/grobid_tool.py                        |  4
-rwxr-xr-x  python/ia_pdf_match.py                       |  3
-rwxr-xr-x  python/ingest_tool.py                        |  6
-rwxr-xr-x  python/pdfextract_tool.py                    |  4
-rwxr-xr-x  python/pdftrio_tool.py                       |  4
-rwxr-xr-x  python/persist_tool.py                       |  2
-rw-r--r--  python/sandcrawler/__init__.py               | 19
-rw-r--r--  python/sandcrawler/db.py                     |  3
-rw-r--r--  python/sandcrawler/fileset_platforms.py      | 10
-rw-r--r--  python/sandcrawler/fileset_strategies.py     | 13
-rw-r--r--  python/sandcrawler/fileset_types.py          |  3
-rw-r--r--  python/sandcrawler/grobid.py                 |  4
-rw-r--r--  python/sandcrawler/html.py                   |  2
-rw-r--r--  python/sandcrawler/html_metadata.py          |  9
-rw-r--r--  python/sandcrawler/ia.py                     | 22
-rw-r--r--  python/sandcrawler/ingest_file.py            | 26
-rw-r--r--  python/sandcrawler/ingest_fileset.py         | 31
-rw-r--r--  python/sandcrawler/ingest_html.py            | 18
-rw-r--r--  python/sandcrawler/minio.py                  |  2
-rw-r--r--  python/sandcrawler/misc.py                   | 10
-rw-r--r--  python/sandcrawler/pdfextract.py             | 11
-rw-r--r--  python/sandcrawler/pdftrio.py                |  3
-rw-r--r--  python/sandcrawler/persist.py                |  8
-rw-r--r--  python/sandcrawler/workers.py                | 13
-rwxr-xr-x  python/sandcrawler_worker.py                 |  7
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py    |  4
-rwxr-xr-x  python/scripts/archiveorg_fileset.py         |  3
-rwxr-xr-x  python/scripts/cdx_collection.py             |  8
-rwxr-xr-x  python/scripts/covid2ingestrequest.py        |  5
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py   |  8
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py        | 12
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py          | 12
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py         |  7
-rwxr-xr-x  python/scripts/enrich_scored_matches.py      |  5
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py     |  2
-rwxr-xr-x  python/scripts/filter_groupworks.py          |  2
-rwxr-xr-x  python/scripts/filter_scored_matches.py      |  2
-rwxr-xr-x  python/scripts/grobid_affiliations.py        |  3
-rwxr-xr-x  python/scripts/import_grobid_metadata.py     |  4
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py     |  4
-rwxr-xr-x  python/scripts/manifest_converter.py         |  2
-rwxr-xr-x  python/scripts/oai2ingestrequest.py          |  5
-rwxr-xr-x  python/scripts/pdf_thumbnail.py              |  1
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py    |  5
-rw-r--r--  python/tests/test_grobid.py                  |  8
-rw-r--r--  python/tests/test_grobid2json.py             |  4
-rw-r--r--  python/tests/test_html.py                    |  2
-rw-r--r--  python/tests/test_html_ingest.py             |  1
-rw-r--r--  python/tests/test_html_metadata.py           |  1
-rw-r--r--  python/tests/test_ingest.py                  |  7
-rw-r--r--  python/tests/test_live_wayback.py            |  4
-rw-r--r--  python/tests/test_misc.py                    |  3
-rw-r--r--  python/tests/test_pdfextract.py              | 10
-rw-r--r--  python/tests/test_pushers.py                 |  2
-rw-r--r--  python/tests/test_savepagenow.py             |  4
-rw-r--r--  python/tests/test_wayback.py                 |  4
57 files changed, 207 insertions, 178 deletions
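
Assuming isort 5.x (the exact version and configuration are not recorded in this patch), a conformance check along these lines can confirm that a file already matches the ordering applied here; the file path and the line_length value are illustrative guesses:

    # Hypothetical check using isort's Python API (isort >= 5).
    # line_length=120 is inferred from the wrapped import width in this diff;
    # the repository's real isort settings are not part of this patch.
    import isort

    conforms = isort.check_file(
        "python/sandcrawler/ia.py",  # any file touched by this commit
        show_diff=True,              # print the would-be changes on mismatch
        line_length=120,
    )
    print("imports already sorted" if conforms else "file needs isort")
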
diff --git a/python/grobid2json.py b/python/grobid2json.py
index a22d47d..b4bfe2b 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -23,11 +23,11 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
Prints JSON to stdout, errors to stderr
"""
+import argparse
import io
import json
-import argparse
import xml.etree.ElementTree as ET
-from typing import List, Any, Dict, AnyStr, Optional
+from typing import Any, AnyStr, Dict, List, Optional
xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 2a1d8b5..0084330 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -9,10 +9,10 @@ Example of large parallel run, locally:
cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
-import sys
-import json
import argparse
import datetime
+import json
+import sys
from grobid2json import teixml2json
from sandcrawler import *
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 20c65bb..137110c 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -22,8 +22,9 @@ When invoking import matched, be sure to:
--default-mimetype application/pdf
"""
-import sys
import json
+import sys
+
def parse(obj):
if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index fdb5b48..c0ef5aa 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
-import sys
-import json
import argparse
-
+import json
+import sys
from http.server import HTTPServer
+
from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
from sandcrawler.ingest_fileset import IngestFilesetWorker
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index 10a0f48..89ecf1c 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -4,10 +4,10 @@
KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
"""
-import sys
-import json
import argparse
import datetime
+import json
+import sys
from grobid2json import teixml2json
from sandcrawler import *
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 5cffa8c..e195bc7 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -9,10 +9,10 @@ Example of large parallel run, locally:
cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
-import sys
-import json
import argparse
import datetime
+import json
+import sys
from sandcrawler import *
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 69e9374..d52f7c1 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -7,9 +7,9 @@ Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
"""
+import argparse
import os
import sys
-import argparse
from sandcrawler import *
from sandcrawler.persist import *
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 4e004be..bf2d92d 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,11 +1,14 @@
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WarcResource, WaybackClient, WaybackContentError, WaybackError)
from .ingest_file import IngestFileWorker
from .ingest_fileset import IngestFilesetWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .misc import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_datetime, parse_cdx_line
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, PersistIngestFileResultWorker,
+ PersistIngestRequestWorker, PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker)
+from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, KafkaJsonPusher, KafkaSink,
+ MultiprocessWrapper, ZipfilePusher)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 9b55c0c..4dcdb0e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,12 +1,13 @@
-import json
import datetime
+import json
from typing import Optional
import psycopg2
import psycopg2.extras
import requests
+
class SandcrawlerPostgrestClient:
def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs):
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 134ae7c..92fed37 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -1,18 +1,18 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
import urllib.parse
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
-import requests
import internetarchive
+import requests
+from sandcrawler.fileset_types import *
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
-from sandcrawler.fileset_types import *
class FilesetPlatformHelper():
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index d12fc15..c9f182c 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,18 +1,19 @@
+import gzip
+import json
import os
+import shutil
import sys
-import json
-import gzip
import time
-import shutil
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import internetarchive
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+ PlatformScopeError)
from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, FilesetPlatformItem, ArchiveStrategyResult, PlatformScopeError
+from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index d7e9d6d..8ea136e 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -1,9 +1,10 @@
from enum import Enum
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel
+
class IngestStrategy(str, Enum):
WebFile = "web-file"
WebFileset = "web-fileset"
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b4215dc..5242b3a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -2,8 +2,10 @@
import requests
from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+
from .misc import gen_file_metadata
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
class GrobidClient(object):
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index cd0a8e8..6bdebdd 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,7 +1,7 @@
+import json
import re
import sys
-import json
import urllib.parse
from bs4 import BeautifulSoup
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 93c7269..c6725dc 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,16 @@
-import sys
import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+import braveblock
import dateparser
-from selectolax.parser import HTMLParser
import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
from sandcrawler.misc import url_fuzzy_equal
-
# this is a map of metadata keys to CSS selectors
# sources for this list include:
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a2ca346..ca1182f 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,29 +3,31 @@
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import datetime
+import gzip
+import http.client
+import json
import os
import sys
import time
-import gzip
-import json
-import requests
-import datetime
import urllib.parse
-import urllib3.exceptions
-from typing import Tuple
from collections import namedtuple
+from typing import Tuple
-import http.client
+import requests
+import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
http.client._MAXHEADERS = 1000 # type: ignore
-import wayback.exception
from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
+
+import wayback.exception
from gwb.loader import CDXLoaderFactory3
+from wayback.resourcestore import ResourceStore
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
class SandcrawlerBackoffError(Exception):
"""
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 72d4e14..137a793 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,31 +1,31 @@
-import sys
-import json
+import base64
import gzip
+import json
+import sys
import time
-import base64
import xml.etree.ElementTree
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.xml import xml_reserialize
-
MAX_BODY_SIZE_BYTES = 128*1024*1024
class IngestFileWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 7c0dfbd..11386df 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,29 +1,28 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
-
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.workers import SandcrawlerWorker
MAX_BODY_SIZE_BYTES = 128*1024*1024
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 56a726d..7e6e5e3 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,20 +1,20 @@
+import argparse
+import datetime
import io
-import sys
import json
-import datetime
-import argparse
+import sys
import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
-import trafilatura
import pydantic
+import trafilatura
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError,
+ cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..b617178 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,7 +1,7 @@
+import hashlib
import io
import os
-import hashlib
import minio
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 37a2a82..cf8c4bd 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,15 +1,15 @@
-import os
import base64
-import magic
-import hashlib
import datetime
+import hashlib
+import os
from typing import Optional
+import magic
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def clean_url(s: str) -> str:
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9b4e834..2fb34b8 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,17 +1,16 @@
-import sys
-import json
import datetime
-from io import BytesIO
+import json
+import sys
from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, Optional
import poppler
from PIL import Image
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata
-
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
# This is a hack to work around timeouts when processing certain PDFs with
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 161dc9c..7d03357 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,9 +1,10 @@
import time
+
import requests
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index af702ca..66a36bc 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,15 +20,15 @@ grobid
"""
import os
-from typing import Optional, AnyStr
import xml.etree.ElementTree
+from typing import AnyStr, Optional
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import PdfExtractResult
from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
+from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.workers import SandcrawlerWorker
class PersistCdxWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..d8a4016 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,17 @@
-import sys
import json
-import time
+import multiprocessing.pool
import signal
+import sys
+import time
import zipfile
-import requests
-import multiprocessing.pool
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+import requests
+from confluent_kafka import Consumer, KafkaException, Producer
+
+from .ia import PetaboxError, SandcrawlerBackoffError, WaybackContentError, WaybackError
from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
class SandcrawlerWorker(object):
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 8e275cf..e185fad 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -6,14 +6,15 @@ Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
or S3 (SeaweedFS).
"""
-import os
-import sys
import argparse
import datetime
+import os
+import sys
+
import raven
from sandcrawler import *
-from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker
+from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
try:
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 03a1f29..69fe320 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -12,9 +12,9 @@ Run like:
Can then run through requests using that tool, or dump into kafka queue.
"""
-import sys
-import json
import argparse
+import json
+import sys
def run(args):
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 0e507eb..86ca062 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -9,13 +9,12 @@ TODO:
- should this check the item type?
"""
-import sys
import json
+import sys
from typing import Any
import internetarchive
-
FORMAT_TO_MIMETYPE = {
'BZIP': 'application/x-bzip',
'BZIP2': 'application/x-bzip2',
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index e867b21..5e33def 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -11,12 +11,14 @@ Call with a collection name:
"""
import os
-import sys
import shutil
-import tempfile
-import requests
import subprocess
+import sys
+import tempfile
+
import internetarchive as ia
+import requests
+
def run():
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 33c425d..1b7c85c 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -4,9 +4,10 @@
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..62a85e6 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -23,12 +23,12 @@ Requires:
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index 3dcf962..ab1906a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -7,19 +7,19 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import raven
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
+from wayback.resourcestore import ResourceStore
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 39ac000..f103205 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -33,20 +33,20 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import boto3
import raven
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
+from wayback.resourcestore import ResourceStore
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index a7214d0..15b30a0 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -9,11 +9,12 @@ in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""
-import sys
-import json
import argparse
+import json
+import sys
+from typing import List, Optional
+
import urlcanon
-from typing import Optional, List
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 9fe1499..3085346 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -17,9 +17,10 @@ And outputs JSON objects that are can be imported into fatcat with the
No dependencies (only python3 stdlib)
"""
-import sys
-import json
import base64
+import json
+import sys
+
def run():
for line in sys.stdin:
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index dc4bea7..d0666ce 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
with open('title_slug_denylist.txt', 'r') as f:
TITLE_DENYLIST = [l.strip() for l in f]
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..494da71 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3654b87..abf81bd 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
-import sys
import json
+import sys
# out of 1000
score_threshold = 900
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..d391f60 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -10,11 +10,12 @@ Run in bulk like:
ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
-import sys
import json
+import sys
from grobid2json import teixml2json
+
def parse_hbase(line):
line = line.split('\t')
assert len(line) == 2
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index d01b526..8aee0be 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
-import sys
-import json
import datetime
+import json
+import sys
MAX_ABSTRACT_BYTES=4096
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 494ec7a..acba2a8 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -7,9 +7,9 @@ format) back in to regular ingest request JSON.
The only difference is the name and location of some optional keys.
"""
-import sys
-import json
import argparse
+import json
+import sys
def transform(row):
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..8267003 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
to JSON format for fast fatcat importing.
"""
-import sys
import json
import sqlite3
+import sys
# iterate over rows in files metadata...
# 1. select all identified DOIs
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..315b8d2 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -6,9 +6,10 @@ Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index af08db6..71fbe54 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -7,6 +7,7 @@ Originally used to benchmark and compare file size/quality.
"""
import sys
+
import poppler
from PIL import Image
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 5536e6c..590b429 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -4,9 +4,10 @@
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 36d90ef..7d950df 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,11 +1,11 @@
-import pytest
import struct
-import responses
-from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from test_wayback import wayback_client, cdx_client
+import pytest
+import responses
+from test_wayback import cdx_client, wayback_client
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 8497b10..b8999b1 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -1,7 +1,9 @@
-import xml
import json
+import xml
+
import pytest
+
from grobid2json import *
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 9a81852..d4bffc1 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,10 +1,12 @@
import json
+
import pytest
import responses
from sandcrawler.html import extract_fulltext_url
+
def test_extract_fulltext_url():
resp = extract_fulltext_url("asdf", b"asdf")
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index efd1ddf..943e5da 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,5 +1,6 @@
import datetime
+
import pytest
from sandcrawler.ingest_html import *
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index bf26a98..7f35d55 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -1,5 +1,6 @@
import datetime
+
import pytest
from sandcrawler.html_metadata import *
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index b51f721..0965fcb 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -1,12 +1,13 @@
import json
+
import pytest
import responses
+from test_grobid import REAL_TEI_XML
+from test_savepagenow import *
+from test_wayback import *
from sandcrawler import *
-from test_wayback import *
-from test_savepagenow import *
-from test_grobid import REAL_TEI_XML
@pytest.fixture
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 429c6b0..b501dc3 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -8,9 +8,11 @@ Simply uncomment lines to run.
"""
import json
+
import pytest
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata
+from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, SavePageNowError,
+ WaybackClient, WaybackError, gen_file_metadata)
@pytest.fixture
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index bd18e5c..0788c38 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,8 @@
import pytest
-from sandcrawler import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_line
+
def test_gen_file_metadata():
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 255e3fb..1d334d6 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -1,13 +1,13 @@
-import pytest
import struct
-import responses
+
import poppler
+import pytest
+import responses
+from test_wayback import cdx_client, wayback_client
-from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, WaybackClient
from sandcrawler.pdfextract import process_pdf
-from test_wayback import wayback_client, cdx_client
-
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 52f26c0..62fa515 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler.workers import CdxLinePusher, BlackholeSink
+from sandcrawler.workers import BlackholeSink, CdxLinePusher
def test_cdx_line_pusher():
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 63dd887..f3fbfda 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -1,11 +1,11 @@
import json
+
import pytest
import responses
-
-from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial
from test_wayback import *
+from sandcrawler import CdxPartial, SavePageNowClient, SavePageNowError
TARGET = "http://dummy-target.dummy"
JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6bc1ca4..83311b9 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,10 +1,10 @@
import json
+
import pytest
import responses
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
-
+from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"