diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:50:17 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:50:17 -0700 |
commit | 826c7538e091fac14d987a3cd654975da964e240 (patch) | |
tree | 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/fileset_strategies.py | |
parent | 020037d4714e7ba2ab172c7278494aed0b2148ad (diff) | |
download | sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip |
make fmt (black 21.9b0)
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 171 |
1 files changed, 100 insertions, 71 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 9d3bae3..6dc77f9 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -5,15 +5,19 @@ from typing import Optional import internetarchive -from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetPlatformItem, - IngestStrategy, PlatformScopeError) +from sandcrawler.fileset_types import ( + ArchiveStrategyResult, + FilesetPlatformItem, + IngestStrategy, + PlatformScopeError, +) from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path -class FilesetIngestStrategy(): +class FilesetIngestStrategy: def __init__(self): - #self.ingest_strategy = 'unknown' + # self.ingest_strategy = 'unknown' self.success_status = "success" def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]: @@ -29,8 +33,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): self.ingest_strategy = IngestStrategy.ArchiveorgFileset # TODO: enable cleanup when confident (eg, safe path parsing) - self.skip_cleanup_local_files = kwargs.get('skip_cleanup_local_files', True) - self.working_dir = os.environ.get('SANDCRAWLER_WORKING_DIR', '/tmp/sandcrawler/') + self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True) + self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/") try: os.mkdir(self.working_dir) except FileExistsError: @@ -53,23 +57,29 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): found = False for existing in item_files: if existing.name == wanted.path: - if ((existing.sha1 and existing.sha1 == wanted.sha1) or - (existing.md5 and existing.md5 == wanted.md5) - ) and existing.name == wanted.path and existing.size == wanted.size: + if ( + ( + (existing.sha1 and existing.sha1 == wanted.sha1) + or (existing.md5 and existing.md5 == wanted.md5) + ) + and existing.name == wanted.path + and existing.size == wanted.size + ): found = True - wanted.status = 'exists' + wanted.status = "exists" break else: - wanted.status = 'mismatch-existing' + wanted.status = "mismatch-existing" break if not found: print( f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}", - file=sys.stderr) + file=sys.stderr, + ) return None return ArchiveStrategyResult( ingest_strategy=self.ingest_strategy, - status='success-existing', + status="success-existing", manifest=item.manifest, ) @@ -81,12 +91,12 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): if existing: return existing - if item.platform_name == 'archiveorg': + if item.platform_name == "archiveorg": raise PlatformScopeError("should't download archive.org into itself") local_dir = self.working_dir + item.archiveorg_item_name - assert local_dir.startswith('/') - assert local_dir.count('/') > 2 + assert local_dir.startswith("/") + assert local_dir.count("/") > 2 try: os.mkdir(local_dir) except FileExistsError: @@ -96,71 +106,80 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): assert item.manifest for m in item.manifest: if m.path != sanitize_fs_path(m.path): - m.status = 'unsafe-path' + m.status = "unsafe-path" continue - local_path = local_dir + '/' + m.path + local_path = local_dir + "/" + m.path assert m.platform_url if not os.path.exists(local_path): print(f" downloading {m.path}", file=sys.stderr) - with self.ia_session.get(m.platform_url, stream=True, - allow_redirects=True) as r: + with self.ia_session.get( + m.platform_url, stream=True, allow_redirects=True + ) as r: r.raise_for_status() - with open(local_path + '.partial', 'wb') as f: + with open(local_path + ".partial", "wb") as f: for chunk in r.iter_content(chunk_size=256 * 1024): f.write(chunk) - os.rename(local_path + '.partial', local_path) - m.status = 'downloaded-local' + os.rename(local_path + ".partial", local_path) + m.status = "downloaded-local" else: - m.status = 'exists-local' + m.status = "exists-local" print(f" verifying {m.path}", file=sys.stderr) file_meta = gen_file_metadata_path(local_path, allow_empty=True) - assert file_meta[ - 'size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}" + assert ( + file_meta["size_bytes"] == m.size + ), f"expected: {m.size} found: {file_meta['size_bytes']}" if m.sha1: - assert file_meta['sha1hex'] == m.sha1 + assert file_meta["sha1hex"] == m.sha1 else: - m.sha1 = file_meta['sha1hex'] + m.sha1 = file_meta["sha1hex"] if m.sha256: - assert file_meta['sha256hex'] == m.sha256 + assert file_meta["sha256hex"] == m.sha256 else: - m.sha256 = file_meta['sha256hex'] + m.sha256 = file_meta["sha256hex"] if m.md5: - assert file_meta['md5hex'] == m.md5 + assert file_meta["md5hex"] == m.md5 else: - m.md5 = file_meta['md5hex'] + m.md5 = file_meta["md5hex"] if m.mimetype: # 'magic' isn't good and parsing more detailed text file formats like text/csv - if file_meta['mimetype'] != m.mimetype and file_meta['mimetype'] != 'text/plain': + if ( + file_meta["mimetype"] != m.mimetype + and file_meta["mimetype"] != "text/plain" + ): # these 'tab-separated-values' from dataverse are just noise, don't log them - if m.mimetype != 'text/tab-separated-values': + if m.mimetype != "text/tab-separated-values": print( f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}", - file=sys.stderr) - m.mimetype = file_meta['mimetype'] + file=sys.stderr, + ) + m.mimetype = file_meta["mimetype"] else: - m.mimetype = file_meta['mimetype'] - m.status = 'verified-local' + m.mimetype = file_meta["mimetype"] + m.status = "verified-local" # 2. upload all files, with metadata - assert item.archiveorg_item_meta and item.archiveorg_item_meta['collection'] + assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"] item_files = [] for m in item.manifest: - local_path = local_dir + '/' + m.path - item_files.append({ - 'name': local_path, - 'remote_name': m.path, - }) + local_path = local_dir + "/" + m.path + item_files.append( + { + "name": local_path, + "remote_name": m.path, + } + ) print( f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", - file=sys.stderr) + file=sys.stderr, + ) internetarchive.upload( item.archiveorg_item_name, files=item_files, @@ -171,7 +190,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): ) for m in item.manifest: - m.status = 'success' + m.status = "success" # 4. delete local directory if not self.skip_cleanup_local_files: @@ -191,6 +210,7 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy): ArchiveorgFilesetStrategy currently works fine with individual files. Just need to over-ride the ingest_strategy name. """ + def __init__(self): super().__init__() self.ingest_strategy = IngestStrategy.ArchiveorgFileset @@ -204,7 +224,8 @@ class WebFilesetStrategy(FilesetIngestStrategy): self.wayback_client = WaybackClient() self.try_spn2 = True self.spn_client = SavePageNowClient( - spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0) + ) self.max_spn_manifest = 20 def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult: @@ -222,25 +243,31 @@ class WebFilesetStrategy(FilesetIngestStrategy): fetch_url = m.platform_url if not fetch_url: raise NotImplementedError( - "require 'platform_url' for each file when doing Web fetching") + "require 'platform_url' for each file when doing Web fetching" + ) via = "wayback" resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype) - if self.try_spn2 and (resource is None or - (resource and resource.status == 'no-capture')): + if self.try_spn2 and ( + resource is None or (resource and resource.status == "no-capture") + ): if len(item.manifest) > self.max_spn_manifest: - m.status = 'too-much-spn' + m.status = "too-much-spn" continue via = "spn2" - resource = self.spn_client.crawl_resource(fetch_url, - self.wayback_client, - force_simple_get=True) - - print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status), - (resource and resource.terminal_url) - or fetch_url), - file=sys.stderr) + resource = self.spn_client.crawl_resource( + fetch_url, self.wayback_client, force_simple_get=True + ) + + print( + "[FETCH {:>6}] {} {}".format( + via, + (resource and resource.status), + (resource and resource.terminal_url) or fetch_url, + ), + file=sys.stderr, + ) m.terminal_url = resource.terminal_url m.terminal_dt = resource.terminal_dt @@ -248,7 +275,7 @@ class WebFilesetStrategy(FilesetIngestStrategy): if self.ingest_strategy == "web-file": file_resource = resource - if resource.status != 'success': + if resource.status != "success": continue else: assert resource.terminal_status_code == 200 @@ -259,24 +286,26 @@ class WebFilesetStrategy(FilesetIngestStrategy): if self.ingest_strategy == "web-file": file_file_meta = file_meta - if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex'] - ) or (m.sha1 - and m.sha1 != file_meta['sha1hex']): - m.status = 'mismatch' + if ( + file_meta["size_bytes"] != m.size + or (m.md5 and m.md5 != file_meta["md5hex"]) + or (m.sha1 and m.sha1 != file_meta["sha1hex"]) + ): + m.status = "mismatch" continue - m.md5 = m.md5 or file_meta['md5hex'] - m.sha1 = m.sha1 or file_meta['md5hex'] - m.sha256 = m.sha256 or file_meta['sha256hex'] - m.mimetype = m.mimetype or file_meta['mimetype'] + m.md5 = m.md5 or file_meta["md5hex"] + m.sha1 = m.sha1 or file_meta["md5hex"] + m.sha256 = m.sha256 or file_meta["sha256hex"] + m.mimetype = m.mimetype or file_meta["mimetype"] overall_status = self.success_status for m in item.manifest: - if m.status != 'success': - overall_status = m.status or 'not-processed' + if m.status != "success": + overall_status = m.status or "not-processed" break if not item.manifest: - overall_status = 'empty-manifest' + overall_status = "empty-manifest" result = ArchiveStrategyResult( ingest_strategy=self.ingest_strategy, |