about summary refs log tree commit diff stats
path: root/python/sandcrawler/fileset_platforms.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r-- python/sandcrawler/fileset_platforms.py 20
1 files changed, 18 insertions, 2 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 07d9844..5c13318 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -43,7 +43,7 @@ class FilesetPlatformHelper:
def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
assert item.manifest
- total_size = sum([m.size for m in item.manifest]) or 0
+ total_size = sum([m.size or 0 for m in item.manifest]) or 0
largest_size = max([m.size or 0 for m in item.manifest]) or 0
if len(item.manifest) == 1:
if total_size < 64 * 1024 * 1024:
@@ -375,6 +375,11 @@ class FigshareHelper(FilesetPlatformHelper):
comp = comp[2:]
if comp[0] in [
"dataset",
+ # TODO: should the following be considered "out of scope"?
+ "journal_contribution",
+ "presentation",
+ "poster",
+ "thesis",
]:
comp = comp[1:]
@@ -472,7 +477,10 @@ class FigshareHelper(FilesetPlatformHelper):
# extra=dict(),
)
)
- assert not row.get("is_link_only")
+ if row.get("is_link_only"):
+ raise PlatformScopeError(
+ f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+ )
authors = []
for author in obj["authors"]:
@@ -521,6 +529,14 @@ def test_parse_figshare_url_path() -> None:
"12127176",
"4",
),
+ "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+ "1407386",
+ "3",
+ ),
+ "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+ "1094056",
+ "1",
+ ),
}
invalid = [