 python/sandcrawler/fileset_platforms.py |  8 ++++----
 python/sandcrawler/ia.py                |  4 ++--
 python/sandcrawler/ingest_file.py       |  4 ++--
 python/sandcrawler/ingest_fileset.py    |  8 ++------
 python/sandcrawler/pdfextract.py        |  2 --
 python/sandcrawler/persist.py           |  4 ++--
 python/sandcrawler/workers.py           | 16 ++++++++++------
 python/tests/test_savepagenow.py        |  2 +-
 python/tests/test_wayback.py            |  1 +
 9 files changed, 24 insertions(+), 25 deletions(-)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 2811100..c97e639 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -126,7 +126,7 @@ class DataverseHelper(FilesetPlatformHelper):
         # TODO: could also do HTML platform detection or something?

         components = urllib.parse.urlparse(url)
-        platform_domain = components.netloc.split(':')[0].lower()
+        # platform_domain = components.netloc.split(':')[0].lower()
         params = urllib.parse.parse_qs(components.query)
         id_param = params.get('persistentId')
         if not id_param:
@@ -134,7 +134,7 @@ class DataverseHelper(FilesetPlatformHelper):
         platform_id = id_param[0]

         try:
-            parsed = self.parse_dataverse_persistentid(platform_id)
+            self.parse_dataverse_persistentid(platform_id)
         except ValueError:
             return False

@@ -411,7 +411,7 @@ class FigshareHelper(FilesetPlatformHelper):
         resp.raise_for_status()
         obj = resp.json()

-        _figshare_type = obj['defined_type_name']
+        # figshare_type = obj['defined_type_name']

         if not obj['is_public']:
             raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
@@ -552,7 +552,7 @@ class ZenodoHelper(FilesetPlatformHelper):
             raise PlatformScopeError(
                 "got a work-level zenodo record, not a versioned record: {work_id}")

-        zenodo_type = obj['metadata']['resource_type']['type']
+        # zenodo_type = obj['metadata']['resource_type']['type']

         if obj['metadata']['access_right'] != 'open':
             raise PlatformRestrictedError(
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index fe739bb..9d990bf 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -18,7 +18,7 @@ import requests
 import urllib3.exceptions

 # not sure this will really work. Should go before wayback imports.
-http.client._MAXHEADERS = 1000  # noqa
+http.client._MAXHEADERS = 1000  # type: ignore

 import wayback.exception
 from gwb.loader import CDXLoaderFactory3
@@ -1153,7 +1153,7 @@ class SavePageNowClient:
                     url=cdx_row.url,
                     datetime=cdx_row.datetime,
                 )
-            except (WaybackError, WaybackContentError) as we:
+            except (WaybackError, WaybackContentError):
                 return ResourceResult(
                     start_url=start_url,
                     hit=False,
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 556e573..bc8643b 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -399,8 +399,8 @@ class IngestFileWorker(SandcrawlerWorker):
         assert resource.body
         try:
             html_doc = HTMLParser(resource.body)
-        except ValueError as ve:
-            return dict(status="html-selectolax-error", )
+        except ValueError:
+            return dict(status="html-selectolax-error")
         html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
         assert html_biblio
         html_body = html_extract_body_teixml(resource.body)
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 4376c89..ea34948 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -110,7 +110,7 @@ class IngestFilesetWorker(IngestFileWorker):
             result['status'] = 'wayback-content-error'
             result['error_message'] = str(e)[:1600]
             return result
-        except NotImplementedError as e:
+        except NotImplementedError:
             #result['status'] = 'not-implemented'
             #result['error_message'] = str(e)[:1600]
             #return result
@@ -269,10 +269,6 @@ class IngestFilesetWorker(IngestFileWorker):
             return result

         # 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
-        terminal_url = base_url
-        if resource:
-            terminal_url = resource.terminal_url
-
         try:
             dataset_meta = platform_helper.process_request(request, resource, html_biblio)
         except PlatformScopeError as e:
@@ -363,7 +359,7 @@ class IngestFilesetWorker(IngestFileWorker):
         if ingest_strategy.endswith('-file'):
             result['fileset_file'] = dict()
             if archive_result.file_file_meta:
-                result['fileset_file']['file_meta'] = file_meta = archive_result.file_file_meta,
+                result['fileset_file']['file_meta'] = archive_result.file_file_meta,
             if archive_result.file_resource:
                 result['fileset_file']['terminal'] = dict(
                     terminal_url=archive_result.file_resource.terminal_url,
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9392136..222a408 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -434,8 +434,6 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
         )

     def process(self, record, key: Optional[str] = None):
-        default_key = record['sha1hex']
-
         fetch_result = self.fetch_blob(record)
         if fetch_result['status'] != 'success':
             return fetch_result
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 44c03f2..b714bc7 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -578,7 +578,7 @@ class PersistThumbnailWorker(SandcrawlerWorker):
         assert isinstance(blob, bytes)
         assert len(blob) >= 50

-        resp = self.s3.put_blob(
+        self.s3.put_blob(
             folder=self.s3_folder,
             blob=blob,
             sha1hex=key,
@@ -619,7 +619,7 @@ class GenericPersistDocWorker(SandcrawlerWorker):
         if 'sha1hex' in record:
             assert key_str == record['sha1hex']

-        resp = self.s3.put_blob(
+        self.s3.put_blob(
             folder=self.s3_folder,
             blob=record[self.doc_key].encode('utf-8'),
             sha1hex=key_str,
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 8c604fb..6b08f03 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -181,12 +181,16 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                 key=default_key,
                 source=record,
                 status="empty-blob",
+                wayback_sec=wayback_sec,
+                petabox_sec=petabox_sec,
             )
         return dict(
             key=default_key,
             status="success",
             source=record,
             blob=blob,
+            wayback_sec=wayback_sec,
+            petabox_sec=petabox_sec,
         )
@@ -219,9 +223,9 @@
         self.pool.terminate()
         if self.sink:
             self.sink.finish()
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
-        return worker_counts
+        return self.counts


 class BlackholeSink(SandcrawlerWorker):
@@ -370,7 +374,7 @@
                 self.worker.push_batch(batch)
                 self.counts['pushed'] += len(batch)
                 batch = []
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
         return self.counts
@@ -417,7 +421,7 @@
                 self.worker.push_batch(batch)
                 self.counts['pushed'] += len(batch)
                 batch = []
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
         return self.counts
@@ -456,7 +460,7 @@
                 self.worker.push_batch(batch)
                 self.counts['pushed'] += len(batch)
                 batch = []
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
         return self.counts
@@ -552,7 +556,7 @@
         # TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
         # commit the current batch if it has been lingering
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
         self.consumer.close()
         return self.counts
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 37f0bc9..50cabb4 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -175,7 +175,7 @@ def test_savepagenow_500(spn_client):
                   body=json.dumps(ERROR_BODY))

     with pytest.raises(SavePageNowError):
-        resp = spn_client.save_url_now_v2(TARGET)
+        spn_client.save_url_now_v2(TARGET)

     assert len(responses.calls) == 2
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 9861db2..0cb59fa 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -117,6 +117,7 @@ def test_cdx_fetch_errors(cdx_client):

     resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
     assert len(responses.calls) == 3
+    assert resp


 @responses.activate