diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:15:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:15:33 -0700 |
commit | 4c0e3f9c02692cd2cab0657d7fbcd1861a586076 (patch) | |
tree | d3a06b0425d4e3477ee3381b23407e92e6c46f45 /python | |
parent | 41fae4c294e2ba43370b4a4193c0f6107201dbf0 (diff) | |
download | sandcrawler-4c0e3f9c02692cd2cab0657d7fbcd1861a586076.tar.gz sandcrawler-4c0e3f9c02692cd2cab0657d7fbcd1861a586076.zip |
update 'XXX' notes from fileset ingest development
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 13 | ||||
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 2 |
2 files changed, 6 insertions, 9 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index b6808b5..6ab4781 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -32,9 +32,6 @@ class FilesetPlatformHelper(): assert item.manifest total_size = sum([m.size for m in item.manifest]) or 0 largest_size = max([m.size or 0 for m in item.manifest]) or 0 - #print(f" total_size={total_size} largest_size={largest_size}", file=sys.stderr) - # XXX: while developing ArchiveorgFileset path -#return IngestStrategy.ArchiveorgFileset if len(item.manifest) == 1: if total_size < 64 * 1024 * 1024: return IngestStrategy.WebFile @@ -174,7 +171,7 @@ class DataverseHelper(FilesetPlatformHelper): raise PlatformScopeError("not actually in scope") if parsed_id['file_id']: - # XXX: maybe we could support this? + # TODO: maybe we could support this? raise PlatformScopeError( "only entire dataverse datasets can be archived with this tool") @@ -227,7 +224,7 @@ class DataverseHelper(FilesetPlatformHelper): platform_sub_id = platform_id.split('/')[-1] archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}" archiveorg_item_meta = dict( - # XXX: collection=platform_domain, + # TODO: collection=platform_domain, collection="datasets", date=obj_latest['releaseTime'].split('T')[0], source= @@ -439,7 +436,7 @@ class FigshareHelper(FilesetPlatformHelper): authors.append(author['full_name']) archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}" archiveorg_item_meta = dict( - # XXX: collection=platform_domain, + # TODO: collection=platform_domain, collection="datasets", creator=authors, doi=obj['doi'], @@ -524,7 +521,7 @@ class ZenodoHelper(FilesetPlatformHelper): else: url = request['base_url'] - # XXX: also look in base_url and resource-non-terminal for ident? to + # TODO: also look in base_url and resource-non-terminal for ident? to # check for work-level redirects
# 1. extract identifier from URL @@ -581,7 +578,7 @@ class ZenodoHelper(FilesetPlatformHelper): authors.append(author['name']) archiveorg_item_name = f"{platform_domain}-{platform_id}" archiveorg_item_meta = dict( - # XXX: collection=platform_domain, + # TODO: collection=platform_domain, collection="datasets", creator=authors, doi=obj['doi'], diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index d88fb46..172e1d7 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -80,7 +80,7 @@ class IngestFilesetWorker(IngestFileWorker): # check against blocklist for block in self.base_url_blocklist: - # XXX: hack to not skip archive.org content + # NOTE: hack to not skip archive.org content if 'archive.org' in block: continue if block in next_url: |