about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/ingest_fileset.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-06 18:02:41 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit f98f6226097ac34cf8a57ee09a4feea9171addfe (patch)
tree 395922d7aabe0dcbed322b4955697bdd2fd67631 /python/sandcrawler/ingest_fileset.py
parent 07e8a199766be77f4e89561d03e9b4e995ab7396 (diff)
download sandcrawler-f98f6226097ac34cf8a57ee09a4feea9171addfe.tar.gz
sandcrawler-f98f6226097ac34cf8a57ee09a4feea9171addfe.zip
progress on web ingest strategy
Diffstat (limited to 'python/sandcrawler/ingest_fileset.py')
-rw-r--r-- python/sandcrawler/ingest_fileset.py 9
1 files changed, 7 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 3b55793..3e782ed 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -246,6 +246,8 @@ class IngestFilesetWorker(IngestFileWorker):
### END COPYPASTA ###
+ # XXX: html_guess_platform()
+
# determine platform
platform_helper = None
for (helper_name, helper) in self.dataset_platform_helpers.items():
@@ -279,6 +281,7 @@ class IngestFilesetWorker(IngestFileWorker):
ingest_strategy = platform_helper.chose_strategy(dataset_meta)
result['ingest_strategy'] = ingest_strategy
+ print(f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}", file=sys.stderr)
strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
if not strategy_helper:
@@ -296,16 +299,18 @@ class IngestFilesetWorker(IngestFileWorker):
if result['status'].startswith('success'):
result['hit'] = True
- print("[SUCCESS {:>5}] file_count={} total_size={}".format(
+ print("[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format(
ingest_type,
result['file_count'],
result['total_size'],
+ ingest_strategy,
), file=sys.stderr)
else:
- print("[FAIL {:>5}] status={} file_count={} total_size={}".format(
+ print("[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format(
ingest_type,
result['status'],
result['file_count'],
result['total_size'],
+ ingest_strategy,
), file=sys.stderr)
return result