make fmt (black 21.9b0)

author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
commit: 826c7538e091fac14d987a3cd654975da964e240 (patch)
tree: 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/fileset_strategies.py
parent: 020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download: sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz
sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip
1 file changed, 100 insertions, 71 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 9d3bae3..6dc77f9 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -5,15 +5,19 @@ from typing import Optional
 
 import internetarchive
 
-from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetPlatformItem,
-                                       IngestStrategy, PlatformScopeError)
+from sandcrawler.fileset_types import (
+    ArchiveStrategyResult,
+    FilesetPlatformItem,
+    IngestStrategy,
+    PlatformScopeError,
+)
 from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
 from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
 
 
-class FilesetIngestStrategy():
+class FilesetIngestStrategy:
     def __init__(self):
-        #self.ingest_strategy = 'unknown'
+        # self.ingest_strategy = 'unknown'
         self.success_status = "success"
 
     def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
@@ -29,8 +33,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         self.ingest_strategy = IngestStrategy.ArchiveorgFileset
 
         # TODO: enable cleanup when confident (eg, safe path parsing)
-        self.skip_cleanup_local_files = kwargs.get('skip_cleanup_local_files', True)
-        self.working_dir = os.environ.get('SANDCRAWLER_WORKING_DIR', '/tmp/sandcrawler/')
+        self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
+        self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
         try:
             os.mkdir(self.working_dir)
         except FileExistsError:
@@ -53,23 +57,29 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
             found = False
             for existing in item_files:
                 if existing.name == wanted.path:
-                    if ((existing.sha1 and existing.sha1 == wanted.sha1) or
-                        (existing.md5 and existing.md5 == wanted.md5)
-                        ) and existing.name == wanted.path and existing.size == wanted.size:
+                    if (
+                        (
+                            (existing.sha1 and existing.sha1 == wanted.sha1)
+                            or (existing.md5 and existing.md5 == wanted.md5)
+                        )
+                        and existing.name == wanted.path
+                        and existing.size == wanted.size
+                    ):
                         found = True
-                        wanted.status = 'exists'
+                        wanted.status = "exists"
                         break
                     else:
-                        wanted.status = 'mismatch-existing'
+                        wanted.status = "mismatch-existing"
                         break
             if not found:
                 print(
                     f"  item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}",
-                    file=sys.stderr)
+                    file=sys.stderr,
+                )
                 return None
         return ArchiveStrategyResult(
             ingest_strategy=self.ingest_strategy,
-            status='success-existing',
+            status="success-existing",
             manifest=item.manifest,
         )
 
@@ -81,12 +91,12 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         if existing:
             return existing
 
-        if item.platform_name == 'archiveorg':
+        if item.platform_name == "archiveorg":
             raise PlatformScopeError("should't download archive.org into itself")
 
         local_dir = self.working_dir + item.archiveorg_item_name
-        assert local_dir.startswith('/')
-        assert local_dir.count('/') > 2
+        assert local_dir.startswith("/")
+        assert local_dir.count("/") > 2
         try:
             os.mkdir(local_dir)
         except FileExistsError:
@@ -96,71 +106,80 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         assert item.manifest
         for m in item.manifest:
             if m.path != sanitize_fs_path(m.path):
-                m.status = 'unsafe-path'
+                m.status = "unsafe-path"
                 continue
 
-            local_path = local_dir + '/' + m.path
+            local_path = local_dir + "/" + m.path
             assert m.platform_url
 
             if not os.path.exists(local_path):
                 print(f"  downloading {m.path}", file=sys.stderr)
-                with self.ia_session.get(m.platform_url, stream=True,
-                                         allow_redirects=True) as r:
+                with self.ia_session.get(
+                    m.platform_url, stream=True, allow_redirects=True
+                ) as r:
                     r.raise_for_status()
-                    with open(local_path + '.partial', 'wb') as f:
+                    with open(local_path + ".partial", "wb") as f:
                         for chunk in r.iter_content(chunk_size=256 * 1024):
                             f.write(chunk)
-                os.rename(local_path + '.partial', local_path)
-                m.status = 'downloaded-local'
+                os.rename(local_path + ".partial", local_path)
+                m.status = "downloaded-local"
             else:
-                m.status = 'exists-local'
+                m.status = "exists-local"
 
             print(f"  verifying {m.path}", file=sys.stderr)
             file_meta = gen_file_metadata_path(local_path, allow_empty=True)
-            assert file_meta[
-                'size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}"
+            assert (
+                file_meta["size_bytes"] == m.size
+            ), f"expected: {m.size} found: {file_meta['size_bytes']}"
 
             if m.sha1:
-                assert file_meta['sha1hex'] == m.sha1
+                assert file_meta["sha1hex"] == m.sha1
             else:
-                m.sha1 = file_meta['sha1hex']
+                m.sha1 = file_meta["sha1hex"]
 
             if m.sha256:
-                assert file_meta['sha256hex'] == m.sha256
+                assert file_meta["sha256hex"] == m.sha256
             else:
-                m.sha256 = file_meta['sha256hex']
+                m.sha256 = file_meta["sha256hex"]
 
             if m.md5:
-                assert file_meta['md5hex'] == m.md5
+                assert file_meta["md5hex"] == m.md5
             else:
-                m.md5 = file_meta['md5hex']
+                m.md5 = file_meta["md5hex"]
 
             if m.mimetype:
                 # 'magic' isn't good and parsing more detailed text file formats like text/csv
-                if file_meta['mimetype'] != m.mimetype and file_meta['mimetype'] != 'text/plain':
+                if (
+                    file_meta["mimetype"] != m.mimetype
+                    and file_meta["mimetype"] != "text/plain"
+                ):
                     # these 'tab-separated-values' from dataverse are just noise, don't log them
-                    if m.mimetype != 'text/tab-separated-values':
+                    if m.mimetype != "text/tab-separated-values":
                         print(
                             f"  WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}",
-                            file=sys.stderr)
-                    m.mimetype = file_meta['mimetype']
+                            file=sys.stderr,
+                        )
+                    m.mimetype = file_meta["mimetype"]
             else:
-                m.mimetype = file_meta['mimetype']
-            m.status = 'verified-local'
+                m.mimetype = file_meta["mimetype"]
+            m.status = "verified-local"
 
         # 2. upload all files, with metadata
-        assert item.archiveorg_item_meta and item.archiveorg_item_meta['collection']
+        assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
         item_files = []
         for m in item.manifest:
-            local_path = local_dir + '/' + m.path
-            item_files.append({
-                'name': local_path,
-                'remote_name': m.path,
-            })
+            local_path = local_dir + "/" + m.path
+            item_files.append(
+                {
+                    "name": local_path,
+                    "remote_name": m.path,
+                }
+            )
 
         print(
             f"  uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
-            file=sys.stderr)
+            file=sys.stderr,
+        )
         internetarchive.upload(
             item.archiveorg_item_name,
             files=item_files,
@@ -171,7 +190,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         )
 
         for m in item.manifest:
-            m.status = 'success'
+            m.status = "success"
 
         # 4. delete local directory
         if not self.skip_cleanup_local_files:
@@ -191,6 +210,7 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
     ArchiveorgFilesetStrategy currently works fine with individual files. Just
     need to over-ride the ingest_strategy name.
     """
+
     def __init__(self):
         super().__init__()
         self.ingest_strategy = IngestStrategy.ArchiveorgFileset
@@ -204,7 +224,8 @@ class WebFilesetStrategy(FilesetIngestStrategy):
         self.wayback_client = WaybackClient()
         self.try_spn2 = True
         self.spn_client = SavePageNowClient(
-            spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
+            spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+        )
         self.max_spn_manifest = 20
 
     def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
@@ -222,25 +243,31 @@ class WebFilesetStrategy(FilesetIngestStrategy):
             fetch_url = m.platform_url
             if not fetch_url:
                 raise NotImplementedError(
-                    "require 'platform_url' for each file when doing Web fetching")
+                    "require 'platform_url' for each file when doing Web fetching"
+                )
 
             via = "wayback"
             resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
 
-            if self.try_spn2 and (resource is None or
-                                  (resource and resource.status == 'no-capture')):
+            if self.try_spn2 and (
+                resource is None or (resource and resource.status == "no-capture")
+            ):
                 if len(item.manifest) > self.max_spn_manifest:
-                    m.status = 'too-much-spn'
+                    m.status = "too-much-spn"
                     continue
                 via = "spn2"
-                resource = self.spn_client.crawl_resource(fetch_url,
-                                                          self.wayback_client,
-                                                          force_simple_get=True)
-
-            print("[FETCH {:>6}] {}  {}".format(via, (resource and resource.status),
-                                                (resource and resource.terminal_url)
-                                                or fetch_url),
-                  file=sys.stderr)
+                resource = self.spn_client.crawl_resource(
+                    fetch_url, self.wayback_client, force_simple_get=True
+                )
+
+            print(
+                "[FETCH {:>6}] {}  {}".format(
+                    via,
+                    (resource and resource.status),
+                    (resource and resource.terminal_url) or fetch_url,
+                ),
+                file=sys.stderr,
+            )
 
             m.terminal_url = resource.terminal_url
             m.terminal_dt = resource.terminal_dt
@@ -248,7 +275,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
             if self.ingest_strategy == "web-file":
                 file_resource = resource
 
-            if resource.status != 'success':
+            if resource.status != "success":
                 continue
             else:
                 assert resource.terminal_status_code == 200
@@ -259,24 +286,26 @@ class WebFilesetStrategy(FilesetIngestStrategy):
             if self.ingest_strategy == "web-file":
                 file_file_meta = file_meta
 
-            if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex']
-                                                     ) or (m.sha1
-                                                           and m.sha1 != file_meta['sha1hex']):
-                m.status = 'mismatch'
+            if (
+                file_meta["size_bytes"] != m.size
+                or (m.md5 and m.md5 != file_meta["md5hex"])
+                or (m.sha1 and m.sha1 != file_meta["sha1hex"])
+            ):
+                m.status = "mismatch"
                 continue
 
-            m.md5 = m.md5 or file_meta['md5hex']
-            m.sha1 = m.sha1 or file_meta['md5hex']
-            m.sha256 = m.sha256 or file_meta['sha256hex']
-            m.mimetype = m.mimetype or file_meta['mimetype']
+            m.md5 = m.md5 or file_meta["md5hex"]
+            m.sha1 = m.sha1 or file_meta["md5hex"]
+            m.sha256 = m.sha256 or file_meta["sha256hex"]
+            m.mimetype = m.mimetype or file_meta["mimetype"]
 
         overall_status = self.success_status
         for m in item.manifest:
-            if m.status != 'success':
-                overall_status = m.status or 'not-processed'
+            if m.status != "success":
+                overall_status = m.status or "not-processed"
                 break
         if not item.manifest:
-            overall_status = 'empty-manifest'
+            overall_status = "empty-manifest"
 
         result = ArchiveStrategyResult(
             ingest_strategy=self.ingest_strategy,
author	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:50:17 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:50:17 -0700
commit	826c7538e091fac14d987a3cd654975da964e240 (patch)
tree	90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/fileset_strategies.py
parent	020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download	sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip