aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:54:37 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:54:37 -0700
commit: 05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
tree: abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/sandcrawler/fileset_strategies.py
parent: f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
download: sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz
download: sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip
make fmt
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- python/sandcrawler/fileset_strategies.py | 68
1 files changed, 41 insertions, 27 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index c9f182c..6c25276 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,4 +1,3 @@
-
import gzip
import json
import os
@@ -10,15 +9,15 @@ from typing import Any, Dict, List, Optional, Tuple
import internetarchive
-from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
- PlatformScopeError)
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile,
+ FilesetPlatformItem, IngestStrategy, PlatformScopeError)
from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding
+from sandcrawler.ia import (ResourceResult, SavePageNowClient, WaybackClient,
+ fix_transfer_encoding)
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
class FilesetIngestStrategy():
-
def __init__(self):
#self.ingest_strategy = 'unknown'
self.success_status = "success"
@@ -31,7 +30,6 @@ class FilesetIngestStrategy():
class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
-
def __init__(self, **kwargs):
super().__init__()
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
@@ -61,7 +59,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
found = False
for existing in item_files:
if existing.name == wanted.path:
- if ((existing.sha1 and existing.sha1 == wanted.sha1) or (existing.md5 and existing.md5 == wanted.md5)) and existing.name == wanted.path and existing.size == wanted.size:
+ if ((existing.sha1 and existing.sha1 == wanted.sha1) or
+ (existing.md5 and existing.md5 == wanted.md5)
+ ) and existing.name == wanted.path and existing.size == wanted.size:
found = True
wanted.status = 'exists'
break
@@ -69,7 +69,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
wanted.status = 'mismatch-existing'
break
if not found:
- print(f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}", file=sys.stderr)
+ print(
+ f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}",
+ file=sys.stderr)
return None
return ArchiveStrategyResult(
ingest_strategy=self.ingest_strategy,
@@ -108,10 +110,11 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
if not os.path.exists(local_path):
print(f" downloading {m.path}", file=sys.stderr)
- with self.ia_session.get(m.platform_url, stream=True, allow_redirects=True) as r:
+ with self.ia_session.get(m.platform_url, stream=True,
+ allow_redirects=True) as r:
r.raise_for_status()
with open(local_path + '.partial', 'wb') as f:
- for chunk in r.iter_content(chunk_size=256*1024):
+ for chunk in r.iter_content(chunk_size=256 * 1024):
f.write(chunk)
os.rename(local_path + '.partial', local_path)
m.status = 'downloaded-local'
@@ -120,7 +123,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
print(f" verifying {m.path}", file=sys.stderr)
file_meta = gen_file_metadata_path(local_path, allow_empty=True)
- assert file_meta['size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}"
+ assert file_meta[
+ 'size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}"
if m.sha1:
assert file_meta['sha1hex'] == m.sha1
@@ -142,7 +146,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
if file_meta['mimetype'] != m.mimetype and file_meta['mimetype'] != 'text/plain':
# these 'tab-separated-values' from dataverse are just noise, don't log them
if m.mimetype != 'text/tab-separated-values':
- print(f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}", file=sys.stderr)
+ print(
+ f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}",
+ file=sys.stderr)
m.mimetype = file_meta['mimetype']
else:
m.mimetype = file_meta['mimetype']
@@ -158,7 +164,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
'remote_name': m.path,
})
- print(f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", file=sys.stderr)
+ print(
+ f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
+ file=sys.stderr)
internetarchive.upload(
item.archiveorg_item_name,
files=item_files,
@@ -183,25 +191,26 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
return result
+
class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
"""
ArchiveorgFilesetStrategy currently works fine with individual files. Just
need to over-ride the ingest_strategy name.
"""
-
def __init__(self):
super().__init__()
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
self.success_status = "success-file"
-class WebFilesetStrategy(FilesetIngestStrategy):
+class WebFilesetStrategy(FilesetIngestStrategy):
def __init__(self, **kwargs):
super().__init__()
self.ingest_strategy = IngestStrategy.WebFileset
self.wayback_client = WaybackClient()
self.try_spn2 = True
- self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
self.max_spn_manifest = 20
def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
@@ -218,23 +227,26 @@ class WebFilesetStrategy(FilesetIngestStrategy):
for m in item.manifest:
fetch_url = m.platform_url
if not fetch_url:
- raise NotImplementedError("require 'platform_url' for each file when doing Web fetching")
+ raise NotImplementedError(
+ "require 'platform_url' for each file when doing Web fetching")
via = "wayback"
resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
- if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
+ if self.try_spn2 and (resource == None or
+ (resource and resource.status == 'no-capture')):
if len(item.manifest) > self.max_spn_manifest:
m.status = 'too-much-spn'
continue
via = "spn2"
- resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=True)
+ resource = self.spn_client.crawl_resource(fetch_url,
+ self.wayback_client,
+ force_simple_get=True)
- print("[FETCH {:>6}] {} {}".format(
- via,
- (resource and resource.status),
- (resource and resource.terminal_url) or fetch_url),
- file=sys.stderr)
+ print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status),
+ (resource and resource.terminal_url)
+ or fetch_url),
+ file=sys.stderr)
m.terminal_url = resource.terminal_url
m.terminal_dt = resource.terminal_dt
@@ -251,9 +263,11 @@ class WebFilesetStrategy(FilesetIngestStrategy):
file_meta, html_resource = fix_transfer_encoding(file_meta, resource)
if self.ingest_strategy == "web-file":
- file_file_meta = file_meta
+ file_file_meta = file_meta
- if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex']) or (m.sha1 and m.sha1 != file_meta['sha1hex']):
+ if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex']
+ ) or (m.sha1
+ and m.sha1 != file_meta['sha1hex']):
m.status = 'mismatch'
continue
@@ -280,8 +294,8 @@ class WebFilesetStrategy(FilesetIngestStrategy):
result.file_resource = file_resource
return result
-class WebFileStrategy(WebFilesetStrategy):
+class WebFileStrategy(WebFilesetStrategy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.ingest_strategy = IngestStrategy.WebFile