about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/fileset_platforms.py |  8
-rw-r--r--  python/sandcrawler/ia.py                |  4
-rw-r--r--  python/sandcrawler/ingest_file.py       |  4
-rw-r--r--  python/sandcrawler/ingest_fileset.py    |  8
-rw-r--r--  python/sandcrawler/pdfextract.py        |  2
-rw-r--r--  python/sandcrawler/persist.py           |  4
-rw-r--r--  python/sandcrawler/workers.py           | 16
-rw-r--r--  python/tests/test_savepagenow.py        |  2
-rw-r--r--  python/tests/test_wayback.py            |  1
9 files changed, 24 insertions(+), 25 deletions(-)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 2811100..c97e639 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -126,7 +126,7 @@ class DataverseHelper(FilesetPlatformHelper):
# TODO: could also do HTML platform detection or something?
components = urllib.parse.urlparse(url)
- platform_domain = components.netloc.split(':')[0].lower()
+ # platform_domain = components.netloc.split(':')[0].lower()
params = urllib.parse.parse_qs(components.query)
id_param = params.get('persistentId')
if not id_param:
@@ -134,7 +134,7 @@ class DataverseHelper(FilesetPlatformHelper):
platform_id = id_param[0]
try:
- parsed = self.parse_dataverse_persistentid(platform_id)
+ self.parse_dataverse_persistentid(platform_id)
except ValueError:
return False
@@ -411,7 +411,7 @@ class FigshareHelper(FilesetPlatformHelper):
resp.raise_for_status()
obj = resp.json()
- _figshare_type = obj['defined_type_name']
+ # figshare_type = obj['defined_type_name']
if not obj['is_public']:
raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
@@ -552,7 +552,7 @@ class ZenodoHelper(FilesetPlatformHelper):
raise PlatformScopeError(
"got a work-level zenodo record, not a versioned record: {work_id}")
- zenodo_type = obj['metadata']['resource_type']['type']
+ # zenodo_type = obj['metadata']['resource_type']['type']
if obj['metadata']['access_right'] != 'open':
raise PlatformRestrictedError(
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index fe739bb..9d990bf 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -18,7 +18,7 @@ import requests
import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
-http.client._MAXHEADERS = 1000 # noqa
+http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
from gwb.loader import CDXLoaderFactory3
@@ -1153,7 +1153,7 @@ class SavePageNowClient:
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- except (WaybackError, WaybackContentError) as we:
+ except (WaybackError, WaybackContentError):
return ResourceResult(
start_url=start_url,
hit=False,
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 556e573..bc8643b 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -399,8 +399,8 @@ class IngestFileWorker(SandcrawlerWorker):
assert resource.body
try:
html_doc = HTMLParser(resource.body)
- except ValueError as ve:
- return dict(status="html-selectolax-error", )
+ except ValueError:
+ return dict(status="html-selectolax-error")
html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
assert html_biblio
html_body = html_extract_body_teixml(resource.body)
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 4376c89..ea34948 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -110,7 +110,7 @@ class IngestFilesetWorker(IngestFileWorker):
result['status'] = 'wayback-content-error'
result['error_message'] = str(e)[:1600]
return result
- except NotImplementedError as e:
+ except NotImplementedError:
#result['status'] = 'not-implemented'
#result['error_message'] = str(e)[:1600]
#return result
@@ -269,10 +269,6 @@ class IngestFilesetWorker(IngestFileWorker):
return result
# 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
- terminal_url = base_url
- if resource:
- terminal_url = resource.terminal_url
-
try:
dataset_meta = platform_helper.process_request(request, resource, html_biblio)
except PlatformScopeError as e:
@@ -363,7 +359,7 @@ class IngestFilesetWorker(IngestFileWorker):
if ingest_strategy.endswith('-file'):
result['fileset_file'] = dict()
if archive_result.file_file_meta:
- result['fileset_file']['file_meta'] = file_meta = archive_result.file_file_meta,
+ result['fileset_file']['file_meta'] = archive_result.file_file_meta,
if archive_result.file_resource:
result['fileset_file']['terminal'] = dict(
terminal_url=archive_result.file_resource.terminal_url,
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9392136..222a408 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -434,8 +434,6 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
)
def process(self, record, key: Optional[str] = None):
- default_key = record['sha1hex']
-
fetch_result = self.fetch_blob(record)
if fetch_result['status'] != 'success':
return fetch_result
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 44c03f2..b714bc7 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -578,7 +578,7 @@ class PersistThumbnailWorker(SandcrawlerWorker):
assert isinstance(blob, bytes)
assert len(blob) >= 50
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder=self.s3_folder,
blob=blob,
sha1hex=key,
@@ -619,7 +619,7 @@ class GenericPersistDocWorker(SandcrawlerWorker):
if 'sha1hex' in record:
assert key_str == record['sha1hex']
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder=self.s3_folder,
blob=record[self.doc_key].encode('utf-8'),
sha1hex=key_str,
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 8c604fb..6b08f03 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -181,12 +181,16 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
key=default_key,
source=record,
status="empty-blob",
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
)
return dict(
key=default_key,
status="success",
source=record,
blob=blob,
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
)
@@ -219,9 +223,9 @@ class MultiprocessWrapper(SandcrawlerWorker):
self.pool.terminate()
if self.sink:
self.sink.finish()
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
- return worker_counts
+ return self.counts
class BlackholeSink(SandcrawlerWorker):
@@ -370,7 +374,7 @@ class JsonLinePusher(RecordPusher):
self.worker.push_batch(batch)
self.counts['pushed'] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
@@ -417,7 +421,7 @@ class CdxLinePusher(RecordPusher):
self.worker.push_batch(batch)
self.counts['pushed'] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
@@ -456,7 +460,7 @@ class ZipfilePusher(RecordPusher):
self.worker.push_batch(batch)
self.counts['pushed'] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
@@ -552,7 +556,7 @@ class KafkaJsonPusher(RecordPusher):
# TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
# commit the current batch if it has been lingering
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
self.consumer.close()
return self.counts
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 37f0bc9..50cabb4 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -175,7 +175,7 @@ def test_savepagenow_500(spn_client):
body=json.dumps(ERROR_BODY))
with pytest.raises(SavePageNowError):
- resp = spn_client.save_url_now_v2(TARGET)
+ spn_client.save_url_now_v2(TARGET)
assert len(responses.calls) == 2
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 9861db2..0cb59fa 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -117,6 +117,7 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
assert len(responses.calls) == 3
+ assert resp
@responses.activate