about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
commit: 826c7538e091fac14d987a3cd654975da964e240 (patch)
tree: 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/fileset_strategies.py
parent: 020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download: sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz
download: sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip
make fmt (black 21.9b0)
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r-- python/sandcrawler/fileset_strategies.py 171
1 files changed, 100 insertions, 71 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 9d3bae3..6dc77f9 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -5,15 +5,19 @@ from typing import Optional
import internetarchive
-from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetPlatformItem,
- IngestStrategy, PlatformScopeError)
+from sandcrawler.fileset_types import (
+ ArchiveStrategyResult,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformScopeError,
+)
from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
-class FilesetIngestStrategy():
+class FilesetIngestStrategy:
def __init__(self):
- #self.ingest_strategy = 'unknown'
+ # self.ingest_strategy = 'unknown'
self.success_status = "success"
def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
@@ -29,8 +33,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
# TODO: enable cleanup when confident (eg, safe path parsing)
- self.skip_cleanup_local_files = kwargs.get('skip_cleanup_local_files', True)
- self.working_dir = os.environ.get('SANDCRAWLER_WORKING_DIR', '/tmp/sandcrawler/')
+ self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
+ self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
try:
os.mkdir(self.working_dir)
except FileExistsError:
@@ -53,23 +57,29 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
found = False
for existing in item_files:
if existing.name == wanted.path:
- if ((existing.sha1 and existing.sha1 == wanted.sha1) or
- (existing.md5 and existing.md5 == wanted.md5)
- ) and existing.name == wanted.path and existing.size == wanted.size:
+ if (
+ (
+ (existing.sha1 and existing.sha1 == wanted.sha1)
+ or (existing.md5 and existing.md5 == wanted.md5)
+ )
+ and existing.name == wanted.path
+ and existing.size == wanted.size
+ ):
found = True
- wanted.status = 'exists'
+ wanted.status = "exists"
break
else:
- wanted.status = 'mismatch-existing'
+ wanted.status = "mismatch-existing"
break
if not found:
print(
f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}",
- file=sys.stderr)
+ file=sys.stderr,
+ )
return None
return ArchiveStrategyResult(
ingest_strategy=self.ingest_strategy,
- status='success-existing',
+ status="success-existing",
manifest=item.manifest,
)
@@ -81,12 +91,12 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
if existing:
return existing
- if item.platform_name == 'archiveorg':
+ if item.platform_name == "archiveorg":
raise PlatformScopeError("should't download archive.org into itself")
local_dir = self.working_dir + item.archiveorg_item_name
- assert local_dir.startswith('/')
- assert local_dir.count('/') > 2
+ assert local_dir.startswith("/")
+ assert local_dir.count("/") > 2
try:
os.mkdir(local_dir)
except FileExistsError:
@@ -96,71 +106,80 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
assert item.manifest
for m in item.manifest:
if m.path != sanitize_fs_path(m.path):
- m.status = 'unsafe-path'
+ m.status = "unsafe-path"
continue
- local_path = local_dir + '/' + m.path
+ local_path = local_dir + "/" + m.path
assert m.platform_url
if not os.path.exists(local_path):
print(f" downloading {m.path}", file=sys.stderr)
- with self.ia_session.get(m.platform_url, stream=True,
- allow_redirects=True) as r:
+ with self.ia_session.get(
+ m.platform_url, stream=True, allow_redirects=True
+ ) as r:
r.raise_for_status()
- with open(local_path + '.partial', 'wb') as f:
+ with open(local_path + ".partial", "wb") as f:
for chunk in r.iter_content(chunk_size=256 * 1024):
f.write(chunk)
- os.rename(local_path + '.partial', local_path)
- m.status = 'downloaded-local'
+ os.rename(local_path + ".partial", local_path)
+ m.status = "downloaded-local"
else:
- m.status = 'exists-local'
+ m.status = "exists-local"
print(f" verifying {m.path}", file=sys.stderr)
file_meta = gen_file_metadata_path(local_path, allow_empty=True)
- assert file_meta[
- 'size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}"
+ assert (
+ file_meta["size_bytes"] == m.size
+ ), f"expected: {m.size} found: {file_meta['size_bytes']}"
if m.sha1:
- assert file_meta['sha1hex'] == m.sha1
+ assert file_meta["sha1hex"] == m.sha1
else:
- m.sha1 = file_meta['sha1hex']
+ m.sha1 = file_meta["sha1hex"]
if m.sha256:
- assert file_meta['sha256hex'] == m.sha256
+ assert file_meta["sha256hex"] == m.sha256
else:
- m.sha256 = file_meta['sha256hex']
+ m.sha256 = file_meta["sha256hex"]
if m.md5:
- assert file_meta['md5hex'] == m.md5
+ assert file_meta["md5hex"] == m.md5
else:
- m.md5 = file_meta['md5hex']
+ m.md5 = file_meta["md5hex"]
if m.mimetype:
# 'magic' isn't good and parsing more detailed text file formats like text/csv
- if file_meta['mimetype'] != m.mimetype and file_meta['mimetype'] != 'text/plain':
+ if (
+ file_meta["mimetype"] != m.mimetype
+ and file_meta["mimetype"] != "text/plain"
+ ):
# these 'tab-separated-values' from dataverse are just noise, don't log them
- if m.mimetype != 'text/tab-separated-values':
+ if m.mimetype != "text/tab-separated-values":
print(
f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}",
- file=sys.stderr)
- m.mimetype = file_meta['mimetype']
+ file=sys.stderr,
+ )
+ m.mimetype = file_meta["mimetype"]
else:
- m.mimetype = file_meta['mimetype']
- m.status = 'verified-local'
+ m.mimetype = file_meta["mimetype"]
+ m.status = "verified-local"
# 2. upload all files, with metadata
- assert item.archiveorg_item_meta and item.archiveorg_item_meta['collection']
+ assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
item_files = []
for m in item.manifest:
- local_path = local_dir + '/' + m.path
- item_files.append({
- 'name': local_path,
- 'remote_name': m.path,
- })
+ local_path = local_dir + "/" + m.path
+ item_files.append(
+ {
+ "name": local_path,
+ "remote_name": m.path,
+ }
+ )
print(
f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
- file=sys.stderr)
+ file=sys.stderr,
+ )
internetarchive.upload(
item.archiveorg_item_name,
files=item_files,
@@ -171,7 +190,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
)
for m in item.manifest:
- m.status = 'success'
+ m.status = "success"
# 4. delete local directory
if not self.skip_cleanup_local_files:
@@ -191,6 +210,7 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
ArchiveorgFilesetStrategy currently works fine with individual files. Just
need to over-ride the ingest_strategy name.
"""
+
def __init__(self):
super().__init__()
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
@@ -204,7 +224,8 @@ class WebFilesetStrategy(FilesetIngestStrategy):
self.wayback_client = WaybackClient()
self.try_spn2 = True
self.spn_client = SavePageNowClient(
- spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
self.max_spn_manifest = 20
def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
@@ -222,25 +243,31 @@ class WebFilesetStrategy(FilesetIngestStrategy):
fetch_url = m.platform_url
if not fetch_url:
raise NotImplementedError(
- "require 'platform_url' for each file when doing Web fetching")
+ "require 'platform_url' for each file when doing Web fetching"
+ )
via = "wayback"
resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
- if self.try_spn2 and (resource is None or
- (resource and resource.status == 'no-capture')):
+ if self.try_spn2 and (
+ resource is None or (resource and resource.status == "no-capture")
+ ):
if len(item.manifest) > self.max_spn_manifest:
- m.status = 'too-much-spn'
+ m.status = "too-much-spn"
continue
via = "spn2"
- resource = self.spn_client.crawl_resource(fetch_url,
- self.wayback_client,
- force_simple_get=True)
-
- print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status),
- (resource and resource.terminal_url)
- or fetch_url),
- file=sys.stderr)
+ resource = self.spn_client.crawl_resource(
+ fetch_url, self.wayback_client, force_simple_get=True
+ )
+
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via,
+ (resource and resource.status),
+ (resource and resource.terminal_url) or fetch_url,
+ ),
+ file=sys.stderr,
+ )
m.terminal_url = resource.terminal_url
m.terminal_dt = resource.terminal_dt
@@ -248,7 +275,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
if self.ingest_strategy == "web-file":
file_resource = resource
- if resource.status != 'success':
+ if resource.status != "success":
continue
else:
assert resource.terminal_status_code == 200
@@ -259,24 +286,26 @@ class WebFilesetStrategy(FilesetIngestStrategy):
if self.ingest_strategy == "web-file":
file_file_meta = file_meta
- if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex']
- ) or (m.sha1
- and m.sha1 != file_meta['sha1hex']):
- m.status = 'mismatch'
+ if (
+ file_meta["size_bytes"] != m.size
+ or (m.md5 and m.md5 != file_meta["md5hex"])
+ or (m.sha1 and m.sha1 != file_meta["sha1hex"])
+ ):
+ m.status = "mismatch"
continue
- m.md5 = m.md5 or file_meta['md5hex']
- m.sha1 = m.sha1 or file_meta['md5hex']
- m.sha256 = m.sha256 or file_meta['sha256hex']
- m.mimetype = m.mimetype or file_meta['mimetype']
+ m.md5 = m.md5 or file_meta["md5hex"]
+ m.sha1 = m.sha1 or file_meta["md5hex"]
+ m.sha256 = m.sha256 or file_meta["sha256hex"]
+ m.mimetype = m.mimetype or file_meta["mimetype"]
overall_status = self.success_status
for m in item.manifest:
- if m.status != 'success':
- overall_status = m.status or 'not-processed'
+ if m.status != "success":
+ overall_status = m.status or "not-processed"
break
if not item.manifest:
- overall_status = 'empty-manifest'
+ overall_status = "empty-manifest"
result = ArchiveStrategyResult(
ingest_strategy=self.ingest_strategy,