Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r--  python/sandcrawler/fileset_strategies.py  76
1 file changed, 49 insertions(+), 27 deletions(-)
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index b0131f4..f83d1ce 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -4,6 +4,7 @@ import sys
 from typing import Optional
 
 import internetarchive
+import requests
 
 from sandcrawler.fileset_types import (
     ArchiveStrategyResult,
@@ -12,7 +13,12 @@ from sandcrawler.fileset_types import (
     PlatformScopeError,
 )
 from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
-from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
+from sandcrawler.misc import (
+    gen_file_metadata,
+    gen_file_metadata_path,
+    requests_retry_session,
+    sanitize_fs_path,
+)
 
 
 class FilesetIngestStrategy:
@@ -40,12 +46,15 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         except FileExistsError:
             pass
 
-        self.ia_session = internetarchive.get_session(config={
-            's3': {
-                'access': os.environ.get("IA_ACCESS_KEY"),
-                'secret': os.environ.get("IA_SECRET_KEY"),
-            },
-        })
+        self.http_session = requests_retry_session()
+        self.ia_session = internetarchive.get_session(
+            config={
+                "s3": {
+                    "access": os.environ.get("IA_ACCESS_KEY"),
+                    "secret": os.environ.get("IA_SECRET_KEY"),
+                },
+            }
+        )
 
     def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
         """
@@ -119,22 +128,28 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
             if not os.path.exists(os.path.dirname(local_path)):
                 os.mkdir(os.path.dirname(local_path))
 
-            if not os.path.exists(local_path):
+            if os.path.exists(local_path):
+                m.status = "exists-local"
+            else:
                 print(f" downloading {m.path}", file=sys.stderr)
                 # create any sub-directories for this path, if necessary
                 if not os.path.exists(os.path.dirname(local_path)):
                     os.mkdir(os.path.dirname(local_path))
-                with self.ia_session.get(
-                    m.platform_url, stream=True, allow_redirects=True
-                ) as r:
-                    r.raise_for_status()
-                    with open(local_path + ".partial", "wb") as f:
-                        for chunk in r.iter_content(chunk_size=256 * 1024):
-                            f.write(chunk)
-                    os.rename(local_path + ".partial", local_path)
-                    m.status = "downloaded-local"
-            else:
-                m.status = "exists-local"
+                try:
+                    with self.http_session.get(
+                        m.platform_url,
+                        stream=True,
+                        allow_redirects=True,
+                        timeout=2 * 60 * 60,
+                    ) as r:
+                        r.raise_for_status()
+                        with open(local_path + ".partial", "wb") as f:
+                            for chunk in r.iter_content(chunk_size=256 * 1024):
+                                f.write(chunk)
+                        os.rename(local_path + ".partial", local_path)
+                        m.status = "downloaded-local"
+                except requests.exceptions.RequestException:
+                    m.status = "error-platform-download"
 
             print(f" verifying {m.path}", file=sys.stderr)
             file_meta = gen_file_metadata_path(local_path, allow_empty=True)
@@ -190,14 +205,21 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
             f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
             file=sys.stderr,
         )
-        internetarchive.upload(
-            item.archiveorg_item_name,
-            files=item_files,
-            metadata=item.archiveorg_item_meta,
-            checksum=True,
-            queue_derive=False,
-            verify=True,
-        )
+        try:
+            internetarchive.upload(
+                item.archiveorg_item_name,
+                files=item_files,
+                metadata=item.archiveorg_item_meta,
+                checksum=True,
+                queue_derive=False,
+                verify=True,
+            )
+        except requests.exceptions.RequestException:
+            return ArchiveStrategyResult(
+                ingest_strategy=self.ingest_strategy,
+                manifest=item.manifest,
+                status="error-archiveorg-upload",
+            )
 
         for m in item.manifest:
             m.status = "success"
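The new download path depends on requests_retry_session() from sandcrawler.misc, whose implementation is not part of this diff. Below is a minimal sketch of such a helper, assuming the common urllib3 Retry/HTTPAdapter recipe; the retry count, backoff factor, and status codes are illustrative defaults, not necessarily the values sandcrawler.misc actually uses.

from typing import Optional

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def requests_retry_session(
    retries: int = 10,
    backoff_factor: float = 3.0,
    status_forcelist: tuple = (500, 502, 504),
    session: Optional[requests.Session] = None,
) -> requests.Session:
    """Sketch of a session factory that retries transient HTTP failures.

    Parameter defaults here are assumptions for illustration only.
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # Mount the retrying adapter for both plain and TLS connections.
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

With a session like this, the streaming download in the hunk above gets automatic retries for connection errors and retryable 5xx responses, the explicit timeout=2 * 60 * 60 (two hours) keeps a stalled transfer from hanging indefinitely, and any requests.exceptions.RequestException that still escapes is recorded as the new "error-platform-download" manifest status instead of crashing the ingest.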