path: root/python/sandcrawler
author    Bryan Newbold <bnewbold@archive.org>    2021-10-26 11:50:26 -0700
committer Bryan Newbold <bnewbold@archive.org>    2021-10-26 11:50:26 -0700
commit    f9a263a1c0155fc59386fc36f7f4ce25dfc7b23c (patch)
tree      2baa3cb10971bf3549df04266c2b4e8e3a879650 /python/sandcrawler
parent    12d041b781912dc376444198c920ade2d6cee7c8 (diff)
more small fileset ingest tweaks
Diffstat (limited to 'python/sandcrawler')
-rw-r--r--  python/sandcrawler/fileset_platforms.py   18
-rw-r--r--  python/sandcrawler/fileset_strategies.py    9
2 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index cc07948..134ae7c 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -54,6 +54,7 @@ class FilesetPlatformHelper():
class DataverseHelper(FilesetPlatformHelper):
def __init__(self):
+ super().__init__()
self.platform_name = 'dataverse'
self.session = requests.Session()
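The same one-line change repeats in every helper and strategy class below: each subclass `__init__` now chains to the base class constructor before setting its own fields. A minimal sketch of the pattern, with a placeholder attribute standing in for whatever `FilesetPlatformHelper.__init__` actually sets:

```python
import requests

class FilesetPlatformHelper:
    def __init__(self):
        # placeholder default; the real base __init__ may set other shared fields
        self.platform_name = 'unknown'

class DataverseHelper(FilesetPlatformHelper):
    def __init__(self):
        # without this call, anything the base constructor sets up is silently skipped
        super().__init__()
        self.platform_name = 'dataverse'
        self.session = requests.Session()
```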
@@ -324,6 +325,7 @@ def test_parse_dataverse_persistentid():
class FigshareHelper(FilesetPlatformHelper):
def __init__(self):
+ super().__init__()
self.platform_name = 'figshare'
self.session = requests.Session()
@@ -337,15 +339,20 @@ class FigshareHelper(FilesetPlatformHelper):
Raises a ValueError if not a figshare URL
"""
# eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
+ # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
comp = path.split('/')
if len(comp) < 4 or comp[1] != 'articles':
raise ValueError(f"not a figshare URL: {path}")
- if len(comp) == 5 and comp[3].isdigit() and comp[4].isdigit():
- return (comp[3], comp[4])
- elif len(comp) == 4 and comp[3].isdigit():
- return (comp[3], None)
+ comp = comp[2:]
+ if comp[0] in ['dataset',]:
+ comp = comp[1:]
+
+ if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit():
+ return (comp[1], comp[2])
+ elif len(comp) == 2 and comp[1].isdigit():
+ return (comp[1], None)
else:
raise ValueError(f"couldn't find figshare identiier: {path}")
@@ -455,6 +462,7 @@ def test_parse_figshare_url_path():
"/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": ("8987858", "1"),
"/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": ("8987858", None),
"/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+ "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": ("12127176", "4"),
}
invalid = [
@@ -474,6 +482,7 @@ def test_parse_figshare_url_path():
class ZenodoHelper(FilesetPlatformHelper):
def __init__(self):
+ super().__init__()
self.platform_name = 'zenodo'
self.session = requests.Session()
@@ -619,6 +628,7 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
}
def __init__(self):
+ super().__init__()
self.platform_name = 'archiveorg'
self.session = internetarchive.get_session()
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index f2f2fcc..d12fc15 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -32,6 +32,7 @@ class FilesetIngestStrategy():
class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
def __init__(self, **kwargs):
+ super().__init__()
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
# TODO: enable cleanup when confident (eg, safe path parsing)
@@ -195,10 +196,12 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
class WebFilesetStrategy(FilesetIngestStrategy):
def __init__(self, **kwargs):
+ super().__init__()
self.ingest_strategy = IngestStrategy.WebFileset
self.wayback_client = WaybackClient()
self.try_spn2 = True
self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
+ self.max_spn_manifest = 20
def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
"""
@@ -219,10 +222,12 @@ class WebFilesetStrategy(FilesetIngestStrategy):
via = "wayback"
resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
-
if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
+ if len(item.manifest) > self.max_spn_manifest:
+ m.status = 'too-much-spn'
+ continue
via = "spn2"
- resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client)
+ resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=True)
print("[FETCH {:>6}] {} {}".format(
via,
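Taken together, the two changes in this last hunk cap SavePageNow usage for large filesets and switch SPN captures to simple GETs. A rough sketch of the per-file decision, using hypothetical stand-ins (`ManifestEntry`, `fetch_one`) for the surrounding loop and passing the wayback/SPN clients in rather than holding them on `self`:

```python
from dataclasses import dataclass
from typing import Optional

MAX_SPN_MANIFEST = 20  # the new cap introduced here (self.max_spn_manifest)

@dataclass
class ManifestEntry:
    platform_url: str
    mimetype: Optional[str] = None
    status: Optional[str] = None

def fetch_one(m: ManifestEntry, manifest_len: int, wayback_client, spn_client,
              try_spn2: bool = True):
    fetch_url = m.platform_url
    via = "wayback"
    resource = wayback_client.lookup_resource(fetch_url, m.mimetype)
    if try_spn2 and (resource is None or resource.status == 'no-capture'):
        # new: skip SPN entirely when the fileset has too many files
        if manifest_len > MAX_SPN_MANIFEST:
            m.status = 'too-much-spn'
            return None
        via = "spn2"
        # new: force a plain GET capture rather than a browser-based one
        resource = spn_client.crawl_resource(fetch_url, wayback_client,
                                             force_simple_get=True)
    print("[FETCH {:>6}] {}".format(via, fetch_url))  # simplified log line
    return resource
```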