From 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Oct 2021 12:22:38 -0700 Subject: python: isort all imports --- python/grobid2json.py | 4 ++-- python/grobid_tool.py | 4 ++-- python/ia_pdf_match.py | 3 ++- python/ingest_tool.py | 6 +++--- python/pdfextract_tool.py | 4 ++-- python/pdftrio_tool.py | 4 ++-- python/persist_tool.py | 2 +- python/sandcrawler/__init__.py | 19 ++++++++++-------- python/sandcrawler/db.py | 3 ++- python/sandcrawler/fileset_platforms.py | 10 +++++----- python/sandcrawler/fileset_strategies.py | 13 +++++++------ python/sandcrawler/fileset_types.py | 3 ++- python/sandcrawler/grobid.py | 4 +++- python/sandcrawler/html.py | 2 +- python/sandcrawler/html_metadata.py | 9 ++++----- python/sandcrawler/ia.py | 22 +++++++++++---------- python/sandcrawler/ingest_file.py | 26 ++++++++++++------------- python/sandcrawler/ingest_fileset.py | 31 +++++++++++++++--------------- python/sandcrawler/ingest_html.py | 18 ++++++++--------- python/sandcrawler/minio.py | 2 +- python/sandcrawler/misc.py | 10 +++++----- python/sandcrawler/pdfextract.py | 11 +++++------ python/sandcrawler/pdftrio.py | 3 ++- python/sandcrawler/persist.py | 8 ++++---- python/sandcrawler/workers.py | 13 +++++++------ python/sandcrawler_worker.py | 7 ++++--- python/scripts/arabesque2ingestrequest.py | 4 ++-- python/scripts/archiveorg_fileset.py | 3 +-- python/scripts/cdx_collection.py | 8 +++++--- python/scripts/covid2ingestrequest.py | 5 +++-- python/scripts/deliver_dumpgrobid_to_s3.py | 8 ++++---- python/scripts/deliver_gwb_to_disk.py | 12 ++++++------ python/scripts/deliver_gwb_to_s3.py | 12 ++++++------ python/scripts/doaj2ingestrequest.py | 7 ++++--- python/scripts/enrich_scored_matches.py | 5 +++-- python/scripts/filter_grobid_metadata.py | 2 +- python/scripts/filter_groupworks.py | 2 +- python/scripts/filter_scored_matches.py | 2 +- python/scripts/grobid_affiliations.py | 3 ++- python/scripts/import_grobid_metadata.py | 4 ++-- python/scripts/ingestrequest_row2json.py | 4 ++-- python/scripts/manifest_converter.py | 2 +- python/scripts/oai2ingestrequest.py | 5 +++-- python/scripts/pdf_thumbnail.py | 1 + python/scripts/unpaywall2ingestrequest.py | 5 +++-- python/tests/test_grobid.py | 8 ++++---- python/tests/test_grobid2json.py | 4 +++- python/tests/test_html.py | 2 ++ python/tests/test_html_ingest.py | 1 + python/tests/test_html_metadata.py | 1 + python/tests/test_ingest.py | 7 ++++--- python/tests/test_live_wayback.py | 4 +++- python/tests/test_misc.py | 3 ++- python/tests/test_pdfextract.py | 10 +++++----- python/tests/test_pushers.py | 2 +- python/tests/test_savepagenow.py | 4 ++-- python/tests/test_wayback.py | 4 ++-- 57 files changed, 207 insertions(+), 178 deletions(-) diff --git a/python/grobid2json.py b/python/grobid2json.py index a22d47d..b4bfe2b 100755 --- a/python/grobid2json.py +++ b/python/grobid2json.py @@ -23,11 +23,11 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered): Prints JSON to stdout, errors to stderr """ +import argparse import io import json -import argparse import xml.etree.ElementTree as ET -from typing import List, Any, Dict, AnyStr, Optional +from typing import Any, AnyStr, Dict, List, Optional xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" diff --git a/python/grobid_tool.py b/python/grobid_tool.py index 2a1d8b5..0084330 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -9,10 +9,10 @@ Example of large parallel run, locally: cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json - """ -import sys -import json import argparse import datetime +import json +import sys from grobid2json import teixml2json from sandcrawler import * diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py index 20c65bb..137110c 100755 --- a/python/ia_pdf_match.py +++ b/python/ia_pdf_match.py @@ -22,8 +22,9 @@ When invoking import matched, be sure to: --default-mimetype application/pdf """ -import sys import json +import sys + def parse(obj): if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'): diff --git a/python/ingest_tool.py b/python/ingest_tool.py index fdb5b48..c0ef5aa 100755 --- a/python/ingest_tool.py +++ b/python/ingest_tool.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 -import sys -import json import argparse - +import json +import sys from http.server import HTTPServer + from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker from sandcrawler.ingest_fileset import IngestFilesetWorker diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py index 10a0f48..89ecf1c 100755 --- a/python/pdfextract_tool.py +++ b/python/pdfextract_tool.py @@ -4,10 +4,10 @@ KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode """ -import sys -import json import argparse import datetime +import json +import sys from grobid2json import teixml2json from sandcrawler import * diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py index 5cffa8c..e195bc7 100755 --- a/python/pdftrio_tool.py +++ b/python/pdftrio_tool.py @@ -9,10 +9,10 @@ Example of large parallel run, locally: cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json - """ -import sys -import json import argparse import datetime +import json +import sys from sandcrawler import * diff --git a/python/persist_tool.py b/python/persist_tool.py index 69e9374..d52f7c1 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -7,9 +7,9 @@ Normally this is done by workers (in sandcrawler_worker.py) consuming from Kafka feeds, but sometimes we have bulk processing output we want to backfill. """ +import argparse import os import sys -import argparse from sandcrawler import * from sandcrawler.persist import * diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 4e004be..bf2d92d 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -1,11 +1,14 @@ -from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker -from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker -from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url -from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper -from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow +from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient +from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker +from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, SavePageNowClient, + SavePageNowError, WarcResource, WaybackClient, WaybackContentError, WaybackError) from .ingest_file import IngestFileWorker from .ingest_fileset import IngestFilesetWorker -from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker -from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient -from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker +from .misc import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_datetime, parse_cdx_line +from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker +from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker +from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, PersistIngestFileResultWorker, + PersistIngestRequestWorker, PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker) +from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, KafkaJsonPusher, KafkaSink, + MultiprocessWrapper, ZipfilePusher) diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 9b55c0c..4dcdb0e 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -1,12 +1,13 @@ -import json import datetime +import json from typing import Optional import psycopg2 import psycopg2.extras import requests + class SandcrawlerPostgrestClient: def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs): diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index 134ae7c..92fed37 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -1,18 +1,18 @@ -import sys -import json import gzip +import json +import sys import time import urllib.parse from collections import namedtuple -from typing import Optional, Tuple, Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple -import requests import internetarchive +import requests +from sandcrawler.fileset_types import * from sandcrawler.html_metadata import BiblioMetadata from sandcrawler.ia import ResourceResult -from sandcrawler.fileset_types import * class FilesetPlatformHelper(): diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index d12fc15..c9f182c 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -1,18 +1,19 @@ +import gzip +import json import os +import shutil import sys -import json -import gzip import time -import shutil from collections import namedtuple -from typing import Optional, Tuple, Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple import internetarchive +from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy, + PlatformScopeError) from sandcrawler.html_metadata import BiblioMetadata -from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding -from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, FilesetPlatformItem, ArchiveStrategyResult, PlatformScopeError +from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py index d7e9d6d..8ea136e 100644 --- a/python/sandcrawler/fileset_types.py +++ b/python/sandcrawler/fileset_types.py @@ -1,9 +1,10 @@ from enum import Enum -from typing import Optional, Tuple, Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple from pydantic import BaseModel + class IngestStrategy(str, Enum): WebFile = "web-file" WebFileset = "web-fileset" diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index b4215dc..5242b3a 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -2,8 +2,10 @@ import requests from grobid2json import teixml2json -from .workers import SandcrawlerWorker, SandcrawlerFetchWorker + from .misc import gen_file_metadata +from .workers import SandcrawlerFetchWorker, SandcrawlerWorker + class GrobidClient(object): diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index cd0a8e8..6bdebdd 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -1,7 +1,7 @@ +import json import re import sys -import json import urllib.parse from bs4 import BeautifulSoup diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 93c7269..c6725dc 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -1,17 +1,16 @@ -import sys import datetime -from typing import List, Optional, Any, Tuple, Dict +import sys import urllib.parse +from typing import Any, Dict, List, Optional, Tuple +import braveblock import dateparser -from selectolax.parser import HTMLParser import pydantic -import braveblock +from selectolax.parser import HTMLParser from sandcrawler.misc import url_fuzzy_equal - # this is a map of metadata keys to CSS selectors # sources for this list include: # - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index a2ca346..ca1182f 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -3,29 +3,31 @@ # in `wayback` library. Means we can't run pylint. # pylint: skip-file +import datetime +import gzip +import http.client +import json import os import sys import time -import gzip -import json -import requests -import datetime import urllib.parse -import urllib3.exceptions -from typing import Tuple from collections import namedtuple +from typing import Tuple -import http.client +import requests +import urllib3.exceptions # not sure this will really work. Should go before wayback imports. http.client._MAXHEADERS = 1000 # type: ignore -import wayback.exception from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore + +import wayback.exception from gwb.loader import CDXLoaderFactory3 +from wayback.resourcestore import ResourceStore + +from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session -from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url class SandcrawlerBackoffError(Exception): """ diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 72d4e14..137a793 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -1,31 +1,31 @@ -import sys -import json +import base64 import gzip +import json +import sys import time -import base64 import xml.etree.ElementTree from collections import namedtuple -from typing import Optional, Tuple, Any, Dict, List from http.server import BaseHTTPRequestHandler, HTTPServer +from typing import Any, Dict, List, Optional, Tuple import requests from selectolax.parser import HTMLParser -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError +from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.grobid import GrobidClient -from sandcrawler.pdfextract import process_pdf, PdfExtractResult -from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime from sandcrawler.html import extract_fulltext_url -from sandcrawler.ingest_html import fetch_html_resources, \ - quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \ - WebResource, html_guess_platform -from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules +from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules +from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient, + SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, + fix_transfer_encoding) +from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform, + html_guess_scope, quick_fetch_html_resources) +from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime +from sandcrawler.pdfextract import PdfExtractResult, process_pdf from sandcrawler.workers import SandcrawlerWorker -from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.xml import xml_reserialize - MAX_BODY_SIZE_BYTES = 128*1024*1024 class IngestFileWorker(SandcrawlerWorker): diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 7c0dfbd..11386df 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -1,29 +1,28 @@ -import sys -import json import gzip +import json +import sys import time from collections import namedtuple -from typing import Optional, Tuple, Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple import requests from selectolax.parser import HTMLParser -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError -from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime -from sandcrawler.html import extract_fulltext_url -from sandcrawler.ingest_html import fetch_html_resources, \ - quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \ - WebResource, html_guess_platform - -from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules -from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient +from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper +from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy +from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError +from sandcrawler.html import extract_fulltext_url +from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules +from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient, + SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, + fix_transfer_encoding) from sandcrawler.ingest_file import IngestFileWorker -from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE -from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE -from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError - +from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform, + html_guess_scope, quick_fetch_html_resources) +from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime +from sandcrawler.workers import SandcrawlerWorker MAX_BODY_SIZE_BYTES = 128*1024*1024 diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index 56a726d..7e6e5e3 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -1,20 +1,20 @@ +import argparse +import datetime import io -import sys import json -import datetime -import argparse +import sys import xml.etree.ElementTree as ET -from typing import List, Optional, Any, Tuple +from typing import Any, List, Optional, Tuple -import trafilatura import pydantic +import trafilatura from selectolax.parser import HTMLParser -from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError -from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal -from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules - +from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules +from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError, + cdx_to_dict, fix_transfer_encoding) +from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}" diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py index c7deea1..b617178 100644 --- a/python/sandcrawler/minio.py +++ b/python/sandcrawler/minio.py @@ -1,7 +1,7 @@ +import hashlib import io import os -import hashlib import minio diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index 37a2a82..cf8c4bd 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -1,15 +1,15 @@ -import os import base64 -import magic -import hashlib import datetime +import hashlib +import os from typing import Optional +import magic import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error import urlcanon +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error def clean_url(s: str) -> str: diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 9b4e834..2fb34b8 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -1,17 +1,16 @@ -import sys -import json import datetime -from io import BytesIO +import json +import sys from dataclasses import dataclass -from typing import Optional, Dict, Any +from io import BytesIO +from typing import Any, Dict, Optional import poppler from PIL import Image -from .workers import SandcrawlerWorker, SandcrawlerFetchWorker from .misc import gen_file_metadata - +from .workers import SandcrawlerFetchWorker, SandcrawlerWorker # This is a hack to work around timeouts when processing certain PDFs with # poppler. For some reason, the usual Kafka timeout catcher isn't working on diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 161dc9c..7d03357 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -1,9 +1,10 @@ import time + import requests -from .workers import SandcrawlerWorker, SandcrawlerFetchWorker from .misc import gen_file_metadata, requests_retry_session +from .workers import SandcrawlerFetchWorker, SandcrawlerWorker class PdfTrioClient(object): diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index af702ca..66a36bc 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -20,15 +20,15 @@ grobid """ import os -from typing import Optional, AnyStr import xml.etree.ElementTree +from typing import AnyStr, Optional -from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgresClient -from sandcrawler.minio import SandcrawlerMinioClient from sandcrawler.grobid import GrobidClient -from sandcrawler.pdfextract import PdfExtractResult from sandcrawler.ingest_html import HtmlMetaRow +from sandcrawler.minio import SandcrawlerMinioClient +from sandcrawler.pdfextract import PdfExtractResult +from sandcrawler.workers import SandcrawlerWorker class PersistCdxWorker(SandcrawlerWorker): diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index 37e3d7a..d8a4016 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -1,16 +1,17 @@ -import sys import json -import time +import multiprocessing.pool import signal +import sys +import time import zipfile -import requests -import multiprocessing.pool from collections import Counter -from confluent_kafka import Consumer, Producer, KafkaException +import requests +from confluent_kafka import Consumer, KafkaException, Producer + +from .ia import PetaboxError, SandcrawlerBackoffError, WaybackContentError, WaybackError from .misc import parse_cdx_line -from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError class SandcrawlerWorker(object): diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 8e275cf..e185fad 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -6,14 +6,15 @@ Outputs might either be pushed back into Kafka, or directly into sandcrawler-db or S3 (SeaweedFS). """ -import os -import sys import argparse import datetime +import os +import sys + import raven from sandcrawler import * -from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker +from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable try: diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 03a1f29..69fe320 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -12,9 +12,9 @@ Run like: Can then run through requests using that tool, or dump into kafka queue. """ -import sys -import json import argparse +import json +import sys def run(args): diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py index 0e507eb..86ca062 100755 --- a/python/scripts/archiveorg_fileset.py +++ b/python/scripts/archiveorg_fileset.py @@ -9,13 +9,12 @@ TODO: - should this check the item type? """ -import sys import json +import sys from typing import Any import internetarchive - FORMAT_TO_MIMETYPE = { 'BZIP': 'application/x-bzip', 'BZIP2': 'application/x-bzip2', diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py index e867b21..5e33def 100755 --- a/python/scripts/cdx_collection.py +++ b/python/scripts/cdx_collection.py @@ -11,12 +11,14 @@ Call with a collection name: """ import os -import sys import shutil -import tempfile -import requests import subprocess +import sys +import tempfile + import internetarchive as ia +import requests + def run(): diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 33c425d..1b7c85c 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -4,9 +4,10 @@ Transform an unpaywall dump (JSON) into ingest requests. """ -import sys -import json import argparse +import json +import sys + import urlcanon diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 86b3b35..62a85e6 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -23,12 +23,12 @@ Requires: - boto3 (AWS S3 client library) """ -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter import boto3 diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index 3dcf962..ab1906a 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -7,19 +7,19 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk. # in `wayback` library. Means we can't run pylint. # pylint: skip-file -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter +from http.client import IncompleteRead import raven import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory +from wayback.resourcestore import ResourceStore # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable sentry_client = raven.Client() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index 39ac000..f103205 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -33,20 +33,20 @@ Requires: # in `wayback` library. Means we can't run pylint. # pylint: skip-file -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter +from http.client import IncompleteRead import boto3 import raven import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory +from wayback.resourcestore import ResourceStore # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable sentry_client = raven.Client() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index a7214d0..15b30a0 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -9,11 +9,12 @@ in the HTML headers and adds an ingest request on that basis. Or even just run the re-ingest in-process and publish a second result. """ -import sys -import json import argparse +import json +import sys +from typing import List, Optional + import urlcanon -from typing import Optional, List DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 9fe1499..3085346 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -17,9 +17,10 @@ And outputs JSON objects that are can be imported into fatcat with the No dependencies (only python3 stdlib) """ -import sys -import json import base64 +import json +import sys + def run(): for line in sys.stdin: diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index dc4bea7..d0666ce 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -import sys import json +import sys with open('title_slug_denylist.txt', 'r') as f: TITLE_DENYLIST = [l.strip() for l in f] diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index bbba770..494da71 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out: - dates differ (not just year) """ -import sys import json +import sys # out of 1000 SCORE_THRESHOLD = 900 diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index 3654b87..abf81bd 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file). No dependencies (only python3 stdlib) """ -import sys import json +import sys # out of 1000 score_threshold = 900 diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index 79feac1..d391f60 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -10,11 +10,12 @@ Run in bulk like: ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations' """ -import sys import json +import sys from grobid2json import teixml2json + def parse_hbase(line): line = line.split('\t') assert len(line) == 2 diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index d01b526..8aee0be 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -import sys -import json import datetime +import json +import sys MAX_ABSTRACT_BYTES=4096 diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 494ec7a..acba2a8 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -7,9 +7,9 @@ format) back in to regular ingest request JSON. The only difference is the name and location of some optional keys. """ -import sys -import json import argparse +import json +import sys def transform(row): diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py index 35cee5b..8267003 100755 --- a/python/scripts/manifest_converter.py +++ b/python/scripts/manifest_converter.py @@ -10,9 +10,9 @@ This was used to convert this manifest: to JSON format for fast fatcat importing. """ -import sys import json import sqlite3 +import sys # iterate over rows in files metadata... # 1. select all identified DOIs diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 916f41c..315b8d2 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -6,9 +6,10 @@ Transform an OAI-PMH bulk dump (JSON) into ingest requests. Eg: https://archive.org/details/oai_harvest_20200215 """ -import sys -import json import argparse +import json +import sys + import urlcanon DOMAIN_BLOCKLIST = [ diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index af08db6..71fbe54 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -7,6 +7,7 @@ Originally used to benchmark and compare file size/quality. """ import sys + import poppler from PIL import Image diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 5536e6c..590b429 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -4,9 +4,10 @@ Transform an unpaywall dump (JSON) into ingest requests. """ -import sys -import json import argparse +import json +import sys + import urlcanon DOMAIN_BLOCKLIST = [ diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 36d90ef..7d950df 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -1,11 +1,11 @@ -import pytest import struct -import responses -from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient -from test_wayback import wayback_client, cdx_client +import pytest +import responses +from test_wayback import cdx_client, wayback_client +from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py index 8497b10..b8999b1 100644 --- a/python/tests/test_grobid2json.py +++ b/python/tests/test_grobid2json.py @@ -1,7 +1,9 @@ -import xml import json +import xml + import pytest + from grobid2json import * diff --git a/python/tests/test_html.py b/python/tests/test_html.py index 9a81852..d4bffc1 100644 --- a/python/tests/test_html.py +++ b/python/tests/test_html.py @@ -1,10 +1,12 @@ import json + import pytest import responses from sandcrawler.html import extract_fulltext_url + def test_extract_fulltext_url(): resp = extract_fulltext_url("asdf", b"asdf") diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index efd1ddf..943e5da 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -1,5 +1,6 @@ import datetime + import pytest from sandcrawler.ingest_html import * diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index bf26a98..7f35d55 100644 --- a/python/tests/test_html_metadata.py +++ b/python/tests/test_html_metadata.py @@ -1,5 +1,6 @@ import datetime + import pytest from sandcrawler.html_metadata import * diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index b51f721..0965fcb 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -1,12 +1,13 @@ import json + import pytest import responses +from test_grobid import REAL_TEI_XML +from test_savepagenow import * +from test_wayback import * from sandcrawler import * -from test_wayback import * -from test_savepagenow import * -from test_grobid import REAL_TEI_XML @pytest.fixture diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index 429c6b0..b501dc3 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -8,9 +8,11 @@ Simply uncomment lines to run. """ import json + import pytest -from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata +from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, SavePageNowError, + WaybackClient, WaybackError, gen_file_metadata) @pytest.fixture diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index bd18e5c..0788c38 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,7 +1,8 @@ import pytest -from sandcrawler import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, clean_url +from sandcrawler import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_line + def test_gen_file_metadata(): diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 255e3fb..1d334d6 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -1,13 +1,13 @@ -import pytest import struct -import responses + import poppler +import pytest +import responses +from test_wayback import cdx_client, wayback_client -from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient +from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, WaybackClient from sandcrawler.pdfextract import process_pdf -from test_wayback import wayback_client, cdx_client - FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py index 52f26c0..62fa515 100644 --- a/python/tests/test_pushers.py +++ b/python/tests/test_pushers.py @@ -1,7 +1,7 @@ import pytest -from sandcrawler.workers import CdxLinePusher, BlackholeSink +from sandcrawler.workers import BlackholeSink, CdxLinePusher def test_cdx_line_pusher(): diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py index 63dd887..f3fbfda 100644 --- a/python/tests/test_savepagenow.py +++ b/python/tests/test_savepagenow.py @@ -1,11 +1,11 @@ import json + import pytest import responses - -from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial from test_wayback import * +from sandcrawler import CdxPartial, SavePageNowClient, SavePageNowError TARGET = "http://dummy-target.dummy" JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8" diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 6bc1ca4..83311b9 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -1,10 +1,10 @@ import json + import pytest import responses -from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError - +from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError CDX_TARGET = "http://fatcat.wiki/" CDX_DT = "20180812220054" -- cgit v1.2.3