about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-26 13:35:36 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-26 13:35:36 -0700
commit 600ad67925a748200ddf21d5aeabd157d2bb3664 (patch)
tree 89ae6bc24e6eb3821c03efd7d781430345c68aa0 /python/sandcrawler
parent 05bd7cbcc62588e431c5efd533189e246b2a997e (diff)
download sandcrawler-600ad67925a748200ddf21d5aeabd157d2bb3664.tar.gz
download sandcrawler-600ad67925a748200ddf21d5aeabd157d2bb3664.zip
start handling trivial lint cleanups: unused imports, 'is None', etc
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/fileset_platforms.py 20
-rw-r--r-- python/sandcrawler/fileset_strategies.py 16
-rw-r--r-- python/sandcrawler/fileset_types.py 2
-rw-r--r-- python/sandcrawler/grobid.py 4
-rw-r--r-- python/sandcrawler/html.py 4
-rw-r--r-- python/sandcrawler/html_metadata.py 6
-rw-r--r-- python/sandcrawler/ia.py 29
-rw-r--r-- python/sandcrawler/ingest_file.py 22
-rw-r--r-- python/sandcrawler/ingest_fileset.py 29
-rw-r--r-- python/sandcrawler/ingest_html.py 9
-rw-r--r-- python/sandcrawler/minio.py 1
-rw-r--r-- python/sandcrawler/misc.py 8
-rw-r--r-- python/sandcrawler/pdfextract.py 2
-rw-r--r-- python/sandcrawler/pdftrio.py 1
-rw-r--r-- python/sandcrawler/workers.py 1
15 files changed, 57 insertions, 97 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index f3441c9..2811100 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -1,10 +1,5 @@
-import gzip
-import json
-import sys
-import time
import urllib.parse
-from collections import namedtuple
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Optional, Tuple
import internetarchive
import requests
@@ -175,12 +170,12 @@ class DataverseHelper(FilesetPlatformHelper):
try:
parsed_id = self.parse_dataverse_persistentid(platform_id)
except ValueError:
- raise PlatformScopeError(f"not actually in scope")
+ raise PlatformScopeError("not actually in scope")
if parsed_id['file_id']:
# XXX: maybe we could support this?
raise PlatformScopeError(
- f"only entire dataverse datasets can be archived with this tool")
+ "only entire dataverse datasets can be archived with this tool")
# 1b. if we didn't get a version number from URL, fetch it from API
if not dataset_version:
@@ -277,13 +272,6 @@ def test_parse_dataverse_persistentid():
"dataset_id": "LL6WXZ",
"file_id": None,
},
- "doi:10.25625/LL6WXZ": {
- "type": "doi",
- "authority": "10.25625",
- "shoulder": None,
- "dataset_id": "LL6WXZ",
- "file_id": None,
- },
"doi:10.5072/FK2/J8SJZB": {
"type": "doi",
"authority": "10.5072",
@@ -423,7 +411,7 @@ class FigshareHelper(FilesetPlatformHelper):
resp.raise_for_status()
obj = resp.json()
- figshare_type = obj['defined_type_name']
+ _figshare_type = obj['defined_type_name']
if not obj['is_public']:
raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 6c25276..4e44d97 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,19 +1,13 @@
-import gzip
-import json
import os
import shutil
import sys
-import time
-from collections import namedtuple
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Optional
import internetarchive
-from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile,
- FilesetPlatformItem, IngestStrategy, PlatformScopeError)
-from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import (ResourceResult, SavePageNowClient, WaybackClient,
- fix_transfer_encoding)
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetPlatformItem,
+ IngestStrategy, PlatformScopeError)
+from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
@@ -233,7 +227,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
via = "wayback"
resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
- if self.try_spn2 and (resource == None or
+ if self.try_spn2 and (resource is None or
(resource and resource.status == 'no-capture')):
if len(item.manifest) > self.max_spn_manifest:
m.status = 'too-much-spn'
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index 606af07..f543ede 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -1,5 +1,5 @@
from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
from pydantic import BaseModel
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 16bbb01..d0b7f7e 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -23,7 +23,7 @@ class GrobidClient(object):
"""
assert blob
- if consolidate_mode == None:
+ if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
try:
@@ -100,8 +100,6 @@ class GrobidWorker(SandcrawlerFetchWorker):
)
def process(self, record, key=None):
- default_key = record['sha1hex']
-
fetch_result = self.fetch_blob(record)
if fetch_result['status'] != 'success':
return fetch_result
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index a44fc67..5b9742a 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -53,12 +53,12 @@ def extract_fulltext_url(html_url, html_body):
print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
elif url.startswith('/'):
if host_prefix + url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
+ print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
else:
return dict(pdf_url=host_prefix + url, technique='citation_pdf_url')
elif url.startswith('http'):
if url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
+ print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
else:
return dict(pdf_url=url, technique='citation_pdf_url')
else:
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 6d27a3a..15a9f2b 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,7 +1,7 @@
import datetime
import sys
import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, List, Optional, Tuple
import braveblock
import dateparser
@@ -687,7 +687,7 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
continue
return (val, pattern.get('technique', 'unknown'))
if self_doc_url:
- print(f" WARN: returning fulltext URL pointing to self", file=sys.stderr)
+ print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
return self_doc_url
return None
@@ -864,7 +864,7 @@ def html_extract_resources(doc_url: str, doc: HTMLParser,
# filter using adblocker
resources = [
r for r in resources if adblock.check_network_urls(
- r['url'], source_url=doc_url, request_type=r['type']) == False
+ r['url'], source_url=doc_url, request_type=r['type']) is False
]
# remove duplicates
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a8ce193..fe739bb 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -11,15 +11,14 @@ import sys
import time
import urllib.parse
from collections import namedtuple
+from http.client import IncompleteRead
from typing import Tuple
import requests
import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
-http.client._MAXHEADERS = 1000 # type: ignore
-
-from http.client import IncompleteRead
+http.client._MAXHEADERS = 1000 # noqa
import wayback.exception
from gwb.loader import CDXLoaderFactory3
@@ -128,18 +127,18 @@ def fuzzy_match_url(left, right):
def test_fuzzy_match_url():
- assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
+ assert fuzzy_match_url("http://thing.com", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "https://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
# should probably handle these?
- assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False
+ assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") is False
class CdxApiError(Exception):
@@ -951,7 +950,7 @@ class SavePageNowClient:
resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id))
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
status = resp.json()['status']
if status == 'pending':
@@ -975,7 +974,7 @@ class SavePageNowClient:
final_json['original_job_id']))
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
final_json = resp.json()
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index b480cc2..556e573 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,23 +1,19 @@
-import base64
-import gzip
import json
import sys
import time
import xml.etree.ElementTree
-from collections import namedtuple
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from typing import Any, Dict, List, Optional, Tuple
+from http.server import BaseHTTPRequestHandler
+from typing import Any, Dict, List, Optional
-import requests
from selectolax.parser import HTMLParser
from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.grobid import GrobidClient
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
- html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError,
- ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient,
+from sandcrawler.html_metadata import (html_extract_biblio, html_extract_resources,
+ load_adblock_rules)
+from sandcrawler.ia import (CdxApiError, NoCaptureError, PetaboxError, ResourceResult,
+ SavePageNowClient, SavePageNowError, WaybackClient,
WaybackContentError, WaybackError, cdx_to_dict,
fix_transfer_encoding)
from sandcrawler.ingest_html import (WebResource, fetch_html_resources,
@@ -211,7 +207,7 @@ class IngestFileWorker(SandcrawlerWorker):
return None
existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
# TODO: filter on more flags?
- if existing and existing['hit'] == True:
+ if existing and existing['hit'] is True:
return existing
else:
return None
@@ -249,7 +245,7 @@ class IngestFileWorker(SandcrawlerWorker):
if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000':
old_failure = True
- if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')
+ if self.try_spn2 and (resource is None or (resource and resource.status == 'no-capture')
or soft404 or old_failure):
via = "spn2"
resource = self.spn_client.crawl_resource(url, self.wayback_client)
@@ -751,7 +747,7 @@ class IngestFileWorker(SandcrawlerWorker):
# fetch must be a hit if we got this far (though not necessarily an ingest hit!)
assert resource
- assert resource.hit == True
+ assert resource.hit is True
assert resource.terminal_status_code in (200, 226)
if resource.terminal_url:
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 5cbb908..4376c89 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,30 +1,19 @@
-import gzip
import json
import sys
import time
-from collections import namedtuple
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Optional
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.db import SandcrawlerPostgrestClient
-from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
-from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE
from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
- html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError,
- ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient,
- WaybackContentError, WaybackError, cdx_to_dict,
- fix_transfer_encoding)
+from sandcrawler.html_metadata import html_extract_biblio
+from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError,
+ WaybackError, cdx_to_dict, fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.ingest_html import (WebResource, fetch_html_resources,
- html_extract_body_teixml, html_guess_platform,
- html_guess_scope, quick_fetch_html_resources)
-from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
-from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.misc import clean_url, gen_file_metadata
MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
@@ -61,7 +50,7 @@ class IngestFilesetWorker(IngestFileWorker):
return None
existing = self.pgrest_client.get_ingest_fileset_result(ingest_type, base_url)
# TODO: filter on more flags?
- if existing and existing['hit'] == True:
+ if existing and existing['hit'] is True:
return existing
else:
return None
@@ -196,7 +185,7 @@ class IngestFilesetWorker(IngestFileWorker):
# fetch must be a hit if we got this far (though not necessarily an ingest hit!)
assert resource
- assert resource.hit == True
+ assert resource.hit is True
assert resource.terminal_status_code in (200, 226)
if resource.terminal_url:
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index bf25d5d..91e5c6e 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,6 +1,5 @@
import argparse
import datetime
-import io
import json
import sys
import xml.etree.ElementTree as ET
@@ -12,9 +11,9 @@ from selectolax.parser import HTMLParser
from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient,
- WaybackContentError, cdx_to_dict, fix_transfer_encoding)
-from sandcrawler.misc import (clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime,
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, WaybackClient, WaybackContentError,
+ cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import (datetime_to_cdx, gen_file_metadata, parse_cdx_datetime,
url_fuzzy_equal)
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
@@ -147,7 +146,7 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
file=sys.stderr)
if not cdx_row.status_code:
# TODO: fall back to a full fetch?
- print(f" WARN: skipping revisit record", file=sys.stderr)
+ print(" WARN: skipping revisit record", file=sys.stderr)
continue
full.append(
WebResource(
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 188621f..046db9e 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,6 +1,5 @@
import hashlib
import io
-import os
import minio
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index ddbd95a..5ca7a4b 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -35,10 +35,10 @@ def url_fuzzy_equal(left: str, right: str) -> bool:
def test_url_fuzzy_equal() -> None:
- assert True == url_fuzzy_equal(
+ assert url_fuzzy_equal(
"http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
"http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
- )
+ ) is True
def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
@@ -239,8 +239,8 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
def test_parse_cdx_datetime() -> None:
- assert parse_cdx_datetime("") == None
- assert parse_cdx_datetime("asdf") == None
+ assert parse_cdx_datetime("") is None
+ assert parse_cdx_datetime("asdf") is None
assert parse_cdx_datetime("19930203123045") != None
assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020,
month=10,
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 190672d..9392136 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -293,7 +293,7 @@ def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExt
return PdfExtractResult(
sha1hex=sha1hex,
status='bad-pdf',
- error_msg=f"PDF known to cause processing issues",
+ error_msg="PDF known to cause processing issues",
file_meta=file_meta,
)
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index e3d4a54..ba875cd 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -75,7 +75,6 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
def process(self, record, key=None):
start_process = time.time()
- default_key = record['sha1hex']
fetch_sec = None
start = time.time()
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 7135f4c..8c604fb 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -116,7 +116,6 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
self.wayback_client = wayback_client
def fetch_blob(self, record):
- start_process = time.time()
default_key = record['sha1hex']
wayback_sec = None
petabox_sec = None