diff options
Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 20 |
1 files changed, 18 insertions, 2 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 07d9844..5c13318 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -43,7 +43,7 @@ class FilesetPlatformHelper:
 
     def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
         assert item.manifest
-        total_size = sum([m.size for m in item.manifest]) or 0
+        total_size = sum([m.size or 0 for m in item.manifest]) or 0
         largest_size = max([m.size or 0 for m in item.manifest]) or 0
         if len(item.manifest) == 1:
             if total_size < 64 * 1024 * 1024:
@@ -375,6 +375,11 @@ class FigshareHelper(FilesetPlatformHelper):
         comp = comp[2:]
         if comp[0] in [
             "dataset",
+            # TODO: should the following be considered "out of scope"?
+            "journal_contribution",
+            "presentation",
+            "poster",
+            "thesis",
         ]:
             comp = comp[1:]
 
@@ -472,7 +477,10 @@ class FigshareHelper(FilesetPlatformHelper):
                     # extra=dict(),
                 )
             )
-            assert not row.get("is_link_only")
+            if row.get("is_link_only"):
+                raise PlatformScopeError(
+                    f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+                )
 
         authors = []
         for author in obj["authors"]:
@@ -521,6 +529,14 @@ def test_parse_figshare_url_path() -> None:
             "12127176",
             "4",
         ),
+        "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+            "1407386",
+            "3",
+        ),
+        "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+            "1094056",
+            "1",
+        ),
     }
 
     invalid = [