diff options
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 68 |
1 files changed, 41 insertions, 27 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index c9f182c..6c25276 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -1,4 +1,3 @@ - import gzip import json import os @@ -10,15 +9,15 @@ from typing import Any, Dict, List, Optional, Tuple import internetarchive -from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy, - PlatformScopeError) +from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, + FilesetPlatformItem, IngestStrategy, PlatformScopeError) from sandcrawler.html_metadata import BiblioMetadata -from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding +from sandcrawler.ia import (ResourceResult, SavePageNowClient, WaybackClient, + fix_transfer_encoding) from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path class FilesetIngestStrategy(): - def __init__(self): #self.ingest_strategy = 'unknown' self.success_status = "success" @@ -31,7 +30,6 @@ class FilesetIngestStrategy(): class ArchiveorgFilesetStrategy(FilesetIngestStrategy): - def __init__(self, **kwargs): super().__init__() self.ingest_strategy = IngestStrategy.ArchiveorgFileset @@ -61,7 +59,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): found = False for existing in item_files: if existing.name == wanted.path: - if ((existing.sha1 and existing.sha1 == wanted.sha1) or (existing.md5 and existing.md5 == wanted.md5)) and existing.name == wanted.path and existing.size == wanted.size: + if ((existing.sha1 and existing.sha1 == wanted.sha1) or + (existing.md5 and existing.md5 == wanted.md5) + ) and existing.name == wanted.path and existing.size == wanted.size: found = True wanted.status = 'exists' break @@ -69,7 +69,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): wanted.status = 'mismatch-existing' break if not found: - print(f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}", file=sys.stderr) + print( + f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}", + file=sys.stderr) return None return ArchiveStrategyResult( ingest_strategy=self.ingest_strategy, @@ -108,10 +110,11 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): if not os.path.exists(local_path): print(f" downloading {m.path}", file=sys.stderr) - with self.ia_session.get(m.platform_url, stream=True, allow_redirects=True) as r: + with self.ia_session.get(m.platform_url, stream=True, + allow_redirects=True) as r: r.raise_for_status() with open(local_path + '.partial', 'wb') as f: - for chunk in r.iter_content(chunk_size=256*1024): + for chunk in r.iter_content(chunk_size=256 * 1024): f.write(chunk) os.rename(local_path + '.partial', local_path) m.status = 'downloaded-local' @@ -120,7 +123,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): print(f" verifying {m.path}", file=sys.stderr) file_meta = gen_file_metadata_path(local_path, allow_empty=True) - assert file_meta['size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}" + assert file_meta[ + 'size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}" if m.sha1: assert file_meta['sha1hex'] == m.sha1 @@ -142,7 +146,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): if file_meta['mimetype'] != m.mimetype and file_meta['mimetype'] != 'text/plain': # these 'tab-separated-values' from dataverse are just noise, don't log them if m.mimetype != 'text/tab-separated-values': - print(f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}", file=sys.stderr) + print( + f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}", + file=sys.stderr) m.mimetype = file_meta['mimetype'] else: m.mimetype = file_meta['mimetype'] @@ -158,7 +164,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): 'remote_name': m.path, }) - print(f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", file=sys.stderr) + print( + f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", + file=sys.stderr) internetarchive.upload( item.archiveorg_item_name, files=item_files, @@ -183,25 +191,26 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): return result + class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy): """ ArchiveorgFilesetStrategy currently works fine with individual files. Just need to over-ride the ingest_strategy name. """ - def __init__(self): super().__init__() self.ingest_strategy = IngestStrategy.ArchiveorgFileset self.success_status = "success-file" -class WebFilesetStrategy(FilesetIngestStrategy): +class WebFilesetStrategy(FilesetIngestStrategy): def __init__(self, **kwargs): super().__init__() self.ingest_strategy = IngestStrategy.WebFileset self.wayback_client = WaybackClient() self.try_spn2 = True - self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + self.spn_client = SavePageNowClient( + spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) self.max_spn_manifest = 20 def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult: @@ -218,23 +227,26 @@ class WebFilesetStrategy(FilesetIngestStrategy): for m in item.manifest: fetch_url = m.platform_url if not fetch_url: - raise NotImplementedError("require 'platform_url' for each file when doing Web fetching") + raise NotImplementedError( + "require 'platform_url' for each file when doing Web fetching") via = "wayback" resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype) - if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')): + if self.try_spn2 and (resource == None or + (resource and resource.status == 'no-capture')): if len(item.manifest) > self.max_spn_manifest: m.status = 'too-much-spn' continue via = "spn2" - resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=True) + resource = self.spn_client.crawl_resource(fetch_url, + self.wayback_client, + force_simple_get=True) - print("[FETCH {:>6}] {} {}".format( - via, - (resource and resource.status), - (resource and resource.terminal_url) or fetch_url), - file=sys.stderr) + print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status), + (resource and resource.terminal_url) + or fetch_url), + file=sys.stderr) m.terminal_url = resource.terminal_url m.terminal_dt = resource.terminal_dt @@ -251,9 +263,11 @@ class WebFilesetStrategy(FilesetIngestStrategy): file_meta, html_resource = fix_transfer_encoding(file_meta, resource) if self.ingest_strategy == "web-file": - file_file_meta = file_meta + file_file_meta = file_meta - if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex']) or (m.sha1 and m.sha1 != file_meta['sha1hex']): + if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex'] + ) or (m.sha1 + and m.sha1 != file_meta['sha1hex']): m.status = 'mismatch' continue @@ -280,8 +294,8 @@ class WebFilesetStrategy(FilesetIngestStrategy): result.file_resource = file_resource return result -class WebFileStrategy(WebFilesetStrategy): +class WebFileStrategy(WebFilesetStrategy): def __init__(self, **kwargs): super().__init__(**kwargs) self.ingest_strategy = IngestStrategy.WebFile |