diff options
Diffstat (limited to 'python')
30 files changed, 86 insertions, 149 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py index 4ba9540..c36fe0a 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -9,7 +9,6 @@ Example of large parallel run, locally: """ import argparse -import datetime import json import sys diff --git a/python/ingest_tool.py b/python/ingest_tool.py index 305c3a8..eb1047d 100755 --- a/python/ingest_tool.py +++ b/python/ingest_tool.py @@ -42,8 +42,8 @@ def run_requests(args): html_quick_mode=args.html_quick_mode, ) fileset_worker = IngestFilesetWorker(try_spn2=not args.no_spn2, ) - for l in args.json_file: - request = json.loads(l.strip()) + for line in args.json_file: + request = json.loads(line.strip()) if request['ingest_type'] in [ 'dataset', ]: diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py index 717b743..f3df6e3 100755 --- a/python/pdfextract_tool.py +++ b/python/pdfextract_tool.py @@ -4,11 +4,8 @@ KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode """ import argparse -import datetime -import json import sys -from grobid2json import teixml2json from sandcrawler import * diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py index 9316313..dbe5b10 100755 --- a/python/pdftrio_tool.py +++ b/python/pdftrio_tool.py @@ -9,8 +9,6 @@ cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftri """ import argparse -import datetime -import json import sys from sandcrawler import * diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index f3441c9..2811100 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -1,10 +1,5 @@ -import gzip -import json -import sys -import time import urllib.parse -from collections import namedtuple -from typing import Any, Dict, List, Optional, Tuple +from typing import Optional, Tuple import internetarchive import requests @@ -175,12 +170,12 @@ class DataverseHelper(FilesetPlatformHelper): try: parsed_id = self.parse_dataverse_persistentid(platform_id) except ValueError: - raise PlatformScopeError(f"not actually in scope") + raise PlatformScopeError("not actually in scope") if parsed_id['file_id']: # XXX: maybe we could support this? raise PlatformScopeError( - f"only entire dataverse datasets can be archived with this tool") + "only entire dataverse datasets can be archived with this tool") # 1b. if we didn't get a version number from URL, fetch it from API if not dataset_version: @@ -277,13 +272,6 @@ def test_parse_dataverse_persistentid(): "dataset_id": "LL6WXZ", "file_id": None, }, - "doi:10.25625/LL6WXZ": { - "type": "doi", - "authority": "10.25625", - "shoulder": None, - "dataset_id": "LL6WXZ", - "file_id": None, - }, "doi:10.5072/FK2/J8SJZB": { "type": "doi", "authority": "10.5072", @@ -423,7 +411,7 @@ class FigshareHelper(FilesetPlatformHelper): resp.raise_for_status() obj = resp.json() - figshare_type = obj['defined_type_name'] + _figshare_type = obj['defined_type_name'] if not obj['is_public']: raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}') diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 6c25276..4e44d97 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -1,19 +1,13 @@ -import gzip -import json import os import shutil import sys -import time -from collections import namedtuple -from typing import Any, Dict, List, Optional, Tuple +from typing import Optional import internetarchive -from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, - FilesetPlatformItem, IngestStrategy, PlatformScopeError) -from sandcrawler.html_metadata import BiblioMetadata -from sandcrawler.ia import (ResourceResult, SavePageNowClient, WaybackClient, - fix_transfer_encoding) +from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetPlatformItem, + IngestStrategy, PlatformScopeError) +from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path @@ -233,7 +227,7 @@ class WebFilesetStrategy(FilesetIngestStrategy): via = "wayback" resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype) - if self.try_spn2 and (resource == None or + if self.try_spn2 and (resource is None or (resource and resource.status == 'no-capture')): if len(item.manifest) > self.max_spn_manifest: m.status = 'too-much-spn' diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py index 606af07..f543ede 100644 --- a/python/sandcrawler/fileset_types.py +++ b/python/sandcrawler/fileset_types.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional from pydantic import BaseModel diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 16bbb01..d0b7f7e 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -23,7 +23,7 @@ class GrobidClient(object): """ assert blob - if consolidate_mode == None: + if consolidate_mode is None: consolidate_mode = self.consolidate_mode try: @@ -100,8 +100,6 @@ class GrobidWorker(SandcrawlerFetchWorker): ) def process(self, record, key=None): - default_key = record['sha1hex'] - fetch_result = self.fetch_blob(record) if fetch_result['status'] != 'success': return fetch_result diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index a44fc67..5b9742a 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -53,12 +53,12 @@ def extract_fulltext_url(html_url, html_body): print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr) elif url.startswith('/'): if host_prefix + url == html_url: - print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr) + print("\tavoiding citation_pdf_url link-loop", file=sys.stderr) else: return dict(pdf_url=host_prefix + url, technique='citation_pdf_url') elif url.startswith('http'): if url == html_url: - print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr) + print("\tavoiding citation_pdf_url link-loop", file=sys.stderr) else: return dict(pdf_url=url, technique='citation_pdf_url') else: diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 6d27a3a..15a9f2b 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -1,7 +1,7 @@ import datetime import sys import urllib.parse -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, List, Optional, Tuple import braveblock import dateparser @@ -687,7 +687,7 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, continue return (val, pattern.get('technique', 'unknown')) if self_doc_url: - print(f" WARN: returning fulltext URL pointing to self", file=sys.stderr) + print(" WARN: returning fulltext URL pointing to self", file=sys.stderr) return self_doc_url return None @@ -864,7 +864,7 @@ def html_extract_resources(doc_url: str, doc: HTMLParser, # filter using adblocker resources = [ r for r in resources if adblock.check_network_urls( - r['url'], source_url=doc_url, request_type=r['type']) == False + r['url'], source_url=doc_url, request_type=r['type']) is False ] # remove duplicates diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index a8ce193..fe739bb 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -11,15 +11,14 @@ import sys import time import urllib.parse from collections import namedtuple +from http.client import IncompleteRead from typing import Tuple import requests import urllib3.exceptions # not sure this will really work. Should go before wayback imports. -http.client._MAXHEADERS = 1000 # type: ignore - -from http.client import IncompleteRead +http.client._MAXHEADERS = 1000 # noqa import wayback.exception from gwb.loader import CDXLoaderFactory3 @@ -128,18 +127,18 @@ def fuzzy_match_url(left, right): def test_fuzzy_match_url(): - assert fuzzy_match_url("http://thing.com", "http://thing.com") == True - assert fuzzy_match_url("http://thing.com", "https://thing.com") == True - assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True - assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True - assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True - assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True - assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False + assert fuzzy_match_url("http://thing.com", "http://thing.com") is True + assert fuzzy_match_url("http://thing.com", "https://thing.com") is True + assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True + assert fuzzy_match_url("http://thing.com", "http://thing.com/") is True + assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True + assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True + assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False # should probably handle these? - assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False - assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False - assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False + assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False + assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") is False + assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") is False class CdxApiError(Exception): @@ -951,7 +950,7 @@ class SavePageNowClient: resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id)) try: resp.raise_for_status() - except: + except Exception: raise SavePageNowError(resp.content) status = resp.json()['status'] if status == 'pending': @@ -975,7 +974,7 @@ class SavePageNowClient: final_json['original_job_id'])) try: resp.raise_for_status() - except: + except Exception: raise SavePageNowError(resp.content) final_json = resp.json() diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index b480cc2..556e573 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -1,23 +1,19 @@ -import base64 -import gzip import json import sys import time import xml.etree.ElementTree -from collections import namedtuple -from http.server import BaseHTTPRequestHandler, HTTPServer -from typing import Any, Dict, List, Optional, Tuple +from http.server import BaseHTTPRequestHandler +from typing import Any, Dict, List, Optional -import requests from selectolax.parser import HTMLParser from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.grobid import GrobidClient from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, - html_extract_resources, load_adblock_rules) -from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, - ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient, +from sandcrawler.html_metadata import (html_extract_biblio, html_extract_resources, + load_adblock_rules) +from sandcrawler.ia import (CdxApiError, NoCaptureError, PetaboxError, ResourceResult, + SavePageNowClient, SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, fix_transfer_encoding) from sandcrawler.ingest_html import (WebResource, fetch_html_resources, @@ -211,7 +207,7 @@ class IngestFileWorker(SandcrawlerWorker): return None existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url) # TODO: filter on more flags? - if existing and existing['hit'] == True: + if existing and existing['hit'] is True: return existing else: return None @@ -249,7 +245,7 @@ class IngestFileWorker(SandcrawlerWorker): if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000': old_failure = True - if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') + if self.try_spn2 and (resource is None or (resource and resource.status == 'no-capture') or soft404 or old_failure): via = "spn2" resource = self.spn_client.crawl_resource(url, self.wayback_client) @@ -751,7 +747,7 @@ class IngestFileWorker(SandcrawlerWorker): # fetch must be a hit if we got this far (though not necessarily an ingest hit!) assert resource - assert resource.hit == True + assert resource.hit is True assert resource.terminal_status_code in (200, 226) if resource.terminal_url: diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 5cbb908..4376c89 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -1,30 +1,19 @@ -import gzip import json import sys import time -from collections import namedtuple -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional import requests from selectolax.parser import HTMLParser -from sandcrawler.db import SandcrawlerPostgrestClient -from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper -from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy +from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE +from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError -from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, - html_extract_resources, load_adblock_rules) -from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, - ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient, - WaybackContentError, WaybackError, cdx_to_dict, - fix_transfer_encoding) +from sandcrawler.html_metadata import html_extract_biblio +from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError, + WaybackError, cdx_to_dict, fix_transfer_encoding) from sandcrawler.ingest_file import IngestFileWorker -from sandcrawler.ingest_html import (WebResource, fetch_html_resources, - html_extract_body_teixml, html_guess_platform, - html_guess_scope, quick_fetch_html_resources) -from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime -from sandcrawler.workers import SandcrawlerWorker +from sandcrawler.misc import clean_url, gen_file_metadata MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024 @@ -61,7 +50,7 @@ class IngestFilesetWorker(IngestFileWorker): return None existing = self.pgrest_client.get_ingest_fileset_result(ingest_type, base_url) # TODO: filter on more flags? - if existing and existing['hit'] == True: + if existing and existing['hit'] is True: return existing else: return None @@ -196,7 +185,7 @@ class IngestFilesetWorker(IngestFileWorker): # fetch must be a hit if we got this far (though not necessarily an ingest hit!) assert resource - assert resource.hit == True + assert resource.hit is True assert resource.terminal_status_code in (200, 226) if resource.terminal_url: diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index bf25d5d..91e5c6e 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -1,6 +1,5 @@ import argparse import datetime -import io import json import sys import xml.etree.ElementTree as ET @@ -12,9 +11,9 @@ from selectolax.parser import HTMLParser from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules) -from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, - WaybackContentError, cdx_to_dict, fix_transfer_encoding) -from sandcrawler.misc import (clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, +from sandcrawler.ia import (CdxApiClient, NoCaptureError, WaybackClient, WaybackContentError, + cdx_to_dict, fix_transfer_encoding) +from sandcrawler.misc import (datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal) TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}" @@ -147,7 +146,7 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, file=sys.stderr) if not cdx_row.status_code: # TODO: fall back to a full fetch? - print(f" WARN: skipping revisit record", file=sys.stderr) + print(" WARN: skipping revisit record", file=sys.stderr) continue full.append( WebResource( diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py index 188621f..046db9e 100644 --- a/python/sandcrawler/minio.py +++ b/python/sandcrawler/minio.py @@ -1,6 +1,5 @@ import hashlib import io -import os import minio diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index ddbd95a..5ca7a4b 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -35,10 +35,10 @@ def url_fuzzy_equal(left: str, right: str) -> bool: def test_url_fuzzy_equal() -> None: - assert True == url_fuzzy_equal( + assert url_fuzzy_equal( "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree" - ) + ) is True def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: @@ -239,8 +239,8 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]: def test_parse_cdx_datetime() -> None: - assert parse_cdx_datetime("") == None - assert parse_cdx_datetime("asdf") == None + assert parse_cdx_datetime("") is None + assert parse_cdx_datetime("asdf") is None assert parse_cdx_datetime("19930203123045") != None assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, month=10, diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 190672d..9392136 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -293,7 +293,7 @@ def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExt return PdfExtractResult( sha1hex=sha1hex, status='bad-pdf', - error_msg=f"PDF known to cause processing issues", + error_msg="PDF known to cause processing issues", file_meta=file_meta, ) diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index e3d4a54..ba875cd 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -75,7 +75,6 @@ class PdfTrioWorker(SandcrawlerFetchWorker): def process(self, record, key=None): start_process = time.time() - default_key = record['sha1hex'] fetch_sec = None start = time.time() diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index 7135f4c..8c604fb 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -116,7 +116,6 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): self.wayback_client = wayback_client def fetch_blob(self, record): - start_process = time.time() default_key = record['sha1hex'] wayback_sec = None petabox_sec = None diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 3c76c17..3e35807 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -6,7 +6,6 @@ or S3 (SeaweedFS). """ import argparse -import datetime import os import sys @@ -18,7 +17,7 @@ from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable try: git_sha = raven.fetch_git_sha('..') -except Exception as e: +except Exception: git_sha = None sentry_client = raven.Client(release=git_sha) diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 55636dc..15d43fb 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -2,9 +2,9 @@ import struct import pytest import responses -from test_wayback import cdx_client, wayback_client +from test_wayback import cdx_client, wayback_client # noqa:F401 -from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient +from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) @@ -58,7 +58,7 @@ def test_grobid_success(grobid_client): @responses.activate -def test_grobid_worker_cdx(grobid_client, wayback_client): +def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811 sink = BlackholeSink() worker = GrobidWorker(grobid_client, wayback_client, sink=sink) diff --git a/python/tests/test_html.py b/python/tests/test_html.py index c5f422e..1caca15 100644 --- a/python/tests/test_html.py +++ b/python/tests/test_html.py @@ -1,8 +1,3 @@ -import json - -import pytest -import responses - from sandcrawler.html import extract_fulltext_url diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index 3bf94e2..727fef9 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -1,7 +1,3 @@ -import datetime - -import pytest - from sandcrawler.ingest_html import * diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 79f50f4..f2318c2 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -87,7 +87,7 @@ def test_ingest_success(ingest_worker_pdf): resp = ingest_worker_pdf.process(request) print(resp) - assert resp['hit'] == True + assert resp['hit'] is True assert resp['status'] == "success" assert resp['request'] == request assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex'] @@ -156,7 +156,7 @@ def test_ingest_landing(ingest_worker): resp = ingest_worker.process(request) print(resp) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "no-pdf-link" assert resp['request'] == request assert 'terminal' in resp @@ -179,7 +179,7 @@ def test_ingest_blocklist(ingest_worker): resp = ingest_worker.process(request) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "skip-url-blocklist" assert resp['request'] == request @@ -197,7 +197,7 @@ def test_ingest_wall_blocklist(ingest_worker): resp = ingest_worker.process(request) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "skip-wall" assert resp['request'] == request @@ -212,6 +212,6 @@ def test_ingest_cookie_blocklist(ingest_worker): resp = ingest_worker.process(request) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "blocked-cookie" assert resp['request'] == request diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index 0ff4902..bc74916 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -6,12 +6,9 @@ automatically in CI. Simply uncomment lines to run. """ -import json - import pytest -from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, - SavePageNowError, WaybackClient, WaybackError, gen_file_metadata) +from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata @pytest.fixture @@ -89,7 +86,7 @@ def test_lookup_resource_success(wayback_client): url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url in (url, url.replace("https://", "http://")) assert resp.cdx.url in (url, url.replace("https://", "http://")) @@ -139,7 +136,7 @@ def test_lookup_ftp(wayback_client): url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url == url assert resp.terminal_status_code == 226 @@ -154,7 +151,7 @@ def test_lookup_ftp(wayback_client): url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url == url assert resp.terminal_status_code == 226 @@ -171,10 +168,10 @@ def test_crawl_ftp(spn_client, wayback_client): resp = spn_client.crawl_resource(url, wayback_client) # FTP isn't supported yet! - #assert resp.hit == True + #assert resp.hit is True #assert resp.status == "success" #assert resp.terminal_url == url #assert resp.cdx.url == url - assert resp.hit == False + assert resp.hit is False assert resp.status == "spn2-no-ftp" diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index dcc1202..7d3e755 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -83,7 +83,7 @@ def test_invalid_cdx(): print("missing warc") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -" - assert parse_cdx_line(raw) == None + assert parse_cdx_line(raw) is None print("bad datetime") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 146b138..086243a 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -2,11 +2,9 @@ import struct import poppler import pytest -import responses -from test_wayback import cdx_client, wayback_client +from test_wayback import cdx_client, wayback_client # noqa:F401 -from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, - WaybackClient) +from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker from sandcrawler.pdfextract import process_pdf FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) @@ -43,7 +41,7 @@ def test_process_dummy_pdf(): assert resp.pdf_extra['page_count'] == 1 -def test_pdfextract_worker_cdx(wayback_client): +def test_pdfextract_worker_cdx(wayback_client): # noqa: F811 sink = BlackholeSink() worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink) diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py index 63f90d3..353a560 100644 --- a/python/tests/test_pushers.py +++ b/python/tests/test_pushers.py @@ -1,5 +1,3 @@ -import pytest - from sandcrawler.workers import BlackholeSink, CdxLinePusher diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py index 80334d9..37f0bc9 100644 --- a/python/tests/test_savepagenow.py +++ b/python/tests/test_savepagenow.py @@ -120,7 +120,7 @@ def test_savepagenow_success(spn_client): assert len(responses.calls) == 4 - assert resp.success == True + assert resp.success is True assert resp.status == "success" assert resp.request_url == TARGET assert resp.terminal_url == TARGET + "/redirect" @@ -151,12 +151,12 @@ def test_savepagenow_remote_error(spn_client): assert len(responses.calls) == 3 - assert resp.success == False + assert resp.success is False assert resp.status == ERROR_BODY['status_ext'] assert resp.request_url == TARGET - assert resp.terminal_url == None - assert resp.terminal_dt == None - assert resp.resources == None + assert resp.terminal_url is None + assert resp.terminal_dt is None + assert resp.resources is None @responses.activate @@ -214,7 +214,7 @@ def test_crawl_resource(spn_client, wayback_client): assert len(responses.calls) == 5 - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.body == WARC_BODY assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32 diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 6ccf775..9861db2 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -3,7 +3,7 @@ import json import pytest import responses -from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError +from sandcrawler import CdxApiClient, WaybackClient CDX_TARGET = "http://fatcat.wiki/" CDX_DT = "20180812220054" @@ -215,4 +215,4 @@ def test_lookup_resource_success(wayback_client): resp = wayback_client.lookup_resource(CDX_TARGET) - assert resp.hit == True + assert resp.hit is True |