path: root/python/sandcrawler
author    Bryan Newbold <bnewbold@archive.org>    2021-10-26 11:50:26 -0700
committer Bryan Newbold <bnewbold@archive.org>    2021-10-26 11:50:26 -0700
commit    f9a263a1c0155fc59386fc36f7f4ce25dfc7b23c (patch)
tree      2baa3cb10971bf3549df04266c2b4e8e3a879650 /python/sandcrawler
parent    12d041b781912dc376444198c920ade2d6cee7c8 (diff)
more small fileset ingest tweaks
Diffstat (limited to 'python/sandcrawler')
-rw-r--r--  python/sandcrawler/fileset_platforms.py   18
-rw-r--r--  python/sandcrawler/fileset_strategies.py    9
2 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index cc07948..134ae7c 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -54,6 +54,7 @@ class FilesetPlatformHelper():
class DataverseHelper(FilesetPlatformHelper):
def __init__(self):
+ super().__init__()
self.platform_name = 'dataverse'
self.session = requests.Session()
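The same one-line change repeats in every helper and strategy class below: each subclass `__init__` now chains to the base class constructor before setting its own fields. A minimal sketch of the pattern, with a placeholder attribute standing in for whatever `FilesetPlatformHelper.__init__` actually sets:

```python
import requests

class FilesetPlatformHelper:
    def __init__(self):
        # placeholder default; the real base __init__ may set other shared fields
        self.platform_name = 'unknown'

class DataverseHelper(FilesetPlatformHelper):
    def __init__(self):
        # without this call, anything the base constructor sets up is silently skipped
        super().__init__()
        self.platform_name = 'dataverse'
        self.session = requests.Session()
```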
@@ -324,6 +325,7 @@ def test_parse_dataverse_persistentid():
class FigshareHelper(FilesetPlatformHelper):
def __init__(self):
+ super().__init__()
self.platform_name = 'figshare'
self.session = requests.Session()
@@ -337,15 +339,20 @@ class FigshareHelper(FilesetPlatformHelper):
Raises a ValueError if not a figshare URL
"""
# eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
+ # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
comp = path.split('/')
if len(comp) < 4 or comp[1] != 'articles':
raise ValueError(f"not a figshare URL: {path}")
- if len(comp) == 5 and comp[3].isdigit() and comp[4].isdigit():
- return (comp[3], comp[4])
- elif len(comp) == 4 and comp[3].isdigit():
- return (comp[3], None)
+ comp = comp[2:]
+ if comp[0] in ['dataset',]:
+ comp = comp[1:]
+
+ if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit():
+ return (comp[1], comp[2])
+ elif len(comp) == 2 and comp[1].isdigit():
+ return (comp[1], None)
else:
raise ValueError(f"couldn't find figshare identiier: {path}")
@@ -455,6 +462,7 @@ def test_parse_figshare_url_path():
"/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": ("8987858", "1"),
"/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": ("8987858", None),
"/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+ "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": ("12127176", "4"),
}
invalid = [
@@ -474,6 +482,7 @@ def test_parse_figshare_url_path():
class ZenodoHelper(FilesetPlatformHelper):
def __init__(self):
+ super().__init__()
self.platform_name = 'zenodo'
self.session = requests.Session()
@@ -619,6 +628,7 @@ class ArchiveOrgHelper(FilesetPlatformHelper):
}
def __init__(self):
+ super().__init__()
self.platform_name = 'archiveorg'
self.session = internetarchive.get_session()
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index f2f2fcc..d12fc15 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -32,6 +32,7 @@ class FilesetIngestStrategy():
class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
def __init__(self, **kwargs):
+ super().__init__()
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
# TODO: enable cleanup when confident (eg, safe path parsing)
@@ -195,10 +196,12 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
class WebFilesetStrategy(FilesetIngestStrategy):
def __init__(self, **kwargs):
+ super().__init__()
self.ingest_strategy = IngestStrategy.WebFileset
self.wayback_client = WaybackClient()
self.try_spn2 = True
self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
+ self.max_spn_manifest = 20
def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
"""
@@ -219,10 +222,12 @@ class WebFilesetStrategy(FilesetIngestStrategy):
via = "wayback"
resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
-
if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
+ if len(item.manifest) > self.max_spn_manifest:
+ m.status = 'too-much-spn'
+ continue
via = "spn2"
- resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client)
+ resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=True)
print("[FETCH {:>6}] {} {}".format(
via,
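Taken together, the two changes in this last hunk cap SavePageNow usage for large filesets and switch SPN captures to simple GETs. A rough sketch of the per-file decision, using hypothetical stand-ins (`ManifestEntry`, `fetch_one`) for the surrounding loop and passing the wayback/SPN clients in rather than holding them on `self`:

```python
from dataclasses import dataclass
from typing import Optional

MAX_SPN_MANIFEST = 20  # the new cap introduced here (self.max_spn_manifest)

@dataclass
class ManifestEntry:
    platform_url: str
    mimetype: Optional[str] = None
    status: Optional[str] = None

def fetch_one(m: ManifestEntry, manifest_len: int, wayback_client, spn_client,
              try_spn2: bool = True):
    fetch_url = m.platform_url
    via = "wayback"
    resource = wayback_client.lookup_resource(fetch_url, m.mimetype)
    if try_spn2 and (resource is None or resource.status == 'no-capture'):
        # new: skip SPN entirely when the fileset has too many files
        if manifest_len > MAX_SPN_MANIFEST:
            m.status = 'too-much-spn'
            return None
        via = "spn2"
        # new: force a plain GET capture rather than a browser-based one
        resource = spn_client.crawl_resource(fetch_url, wayback_client,
                                             force_simple_get=True)
    print("[FETCH {:>6}] {}".format(via, fetch_url))  # simplified log line
    return resource
```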