diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 13:35:36 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 13:35:36 -0700 |
commit | 600ad67925a748200ddf21d5aeabd157d2bb3664 (patch) | |
tree | 89ae6bc24e6eb3821c03efd7d781430345c68aa0 /python/sandcrawler/ingest_file.py | |
parent | 05bd7cbcc62588e431c5efd533189e246b2a997e (diff) | |
download | sandcrawler-600ad67925a748200ddf21d5aeabd157d2bb3664.tar.gz sandcrawler-600ad67925a748200ddf21d5aeabd157d2bb3664.zip |
start handling trivial lint cleanups: unused imports, 'is None', etc
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 22 |
1 file changed, 9 insertions, 13 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index b480cc2..556e573 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -1,23 +1,19 @@ -import base64 -import gzip import json import sys import time import xml.etree.ElementTree -from collections import namedtuple -from http.server import BaseHTTPRequestHandler, HTTPServer -from typing import Any, Dict, List, Optional, Tuple +from http.server import BaseHTTPRequestHandler +from typing import Any, Dict, List, Optional -import requests from selectolax.parser import HTMLParser from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.grobid import GrobidClient from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, - html_extract_resources, load_adblock_rules) -from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, - ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient, +from sandcrawler.html_metadata import (html_extract_biblio, html_extract_resources, + load_adblock_rules) +from sandcrawler.ia import (CdxApiError, NoCaptureError, PetaboxError, ResourceResult, + SavePageNowClient, SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, fix_transfer_encoding) from sandcrawler.ingest_html import (WebResource, fetch_html_resources, @@ -211,7 +207,7 @@ class IngestFileWorker(SandcrawlerWorker): return None existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url) # TODO: filter on more flags? 
- if existing and existing['hit'] == True: + if existing and existing['hit'] is True: return existing else: return None @@ -249,7 +245,7 @@ class IngestFileWorker(SandcrawlerWorker): if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000': old_failure = True - if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') + if self.try_spn2 and (resource is None or (resource and resource.status == 'no-capture') or soft404 or old_failure): via = "spn2" resource = self.spn_client.crawl_resource(url, self.wayback_client) @@ -751,7 +747,7 @@ class IngestFileWorker(SandcrawlerWorker): # fetch must be a hit if we got this far (though not necessarily an ingest hit!) assert resource - assert resource.hit == True + assert resource.hit is True assert resource.terminal_status_code in (200, 226) if resource.terminal_url: |