diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-06 18:02:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:29 -0700 |
commit | f98f6226097ac34cf8a57ee09a4feea9171addfe (patch) | |
tree | 395922d7aabe0dcbed322b4955697bdd2fd67631 /python/sandcrawler/ingest_fileset.py | |
parent | 07e8a199766be77f4e89561d03e9b4e995ab7396 (diff) | |
download | sandcrawler-f98f6226097ac34cf8a57ee09a4feea9171addfe.tar.gz sandcrawler-f98f6226097ac34cf8a57ee09a4feea9171addfe.zip |
progress on web ingest strategy
Diffstat (limited to 'python/sandcrawler/ingest_fileset.py')
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 9 |
1 files changed, 7 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 3b55793..3e782ed 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -246,6 +246,8 @@ class IngestFilesetWorker(IngestFileWorker):
 
         ### END COPYPASTA ###
 
+        # XXX: html_guess_platform()
+
         # determine platform
         platform_helper = None
         for (helper_name, helper) in self.dataset_platform_helpers.items():
@@ -279,6 +281,7 @@ class IngestFilesetWorker(IngestFileWorker):
 
         ingest_strategy = platform_helper.chose_strategy(dataset_meta)
         result['ingest_strategy'] = ingest_strategy
+        print(f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}", file=sys.stderr)
 
         strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
         if not strategy_helper:
@@ -296,16 +299,18 @@ class IngestFilesetWorker(IngestFileWorker):
 
         if result['status'].startswith('success'):
             result['hit'] = True
-            print("[SUCCESS {:>5}] file_count={} total_size={}".format(
+            print("[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format(
                 ingest_type,
                 result['file_count'],
                 result['total_size'],
+                ingest_strategy,
             ), file=sys.stderr)
         else:
-            print("[FAIL {:>5}] status={} file_count={} total_size={}".format(
+            print("[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format(
                 ingest_type,
                 result['status'],
                 result['file_count'],
                 result['total_size'],
+                ingest_strategy,
             ), file=sys.stderr)
         return result
```