about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-26 18:15:33 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-26 18:15:33 -0700
commit4c0e3f9c02692cd2cab0657d7fbcd1861a586076 (patch)
treed3a06b0425d4e3477ee3381b23407e92e6c46f45 /python
parent41fae4c294e2ba43370b4a4193c0f6107201dbf0 (diff)
downloadsandcrawler-4c0e3f9c02692cd2cab0657d7fbcd1861a586076.tar.gz
sandcrawler-4c0e3f9c02692cd2cab0657d7fbcd1861a586076.zip
update 'XXX' notes from fileset ingest development
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/fileset_platforms.py 13
-rw-r--r-- python/sandcrawler/ingest_fileset.py 2
2 files changed, 6 insertions, 9 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index b6808b5..6ab4781 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -32,9 +32,6 @@ class FilesetPlatformHelper():
assert item.manifest
total_size = sum([m.size for m in item.manifest]) or 0
largest_size = max([m.size or 0 for m in item.manifest]) or 0
- #print(f" total_size={total_size} largest_size={largest_size}", file=sys.stderr)
- # XXX: while developing ArchiveorgFileset path
- #return IngestStrategy.ArchiveorgFileset
if len(item.manifest) == 1:
if total_size < 64 * 1024 * 1024:
return IngestStrategy.WebFile
@@ -174,7 +171,7 @@ class DataverseHelper(FilesetPlatformHelper):
raise PlatformScopeError("not actually in scope")
if parsed_id['file_id']:
- # XXX: maybe we could support this?
+ # TODO: maybe we could support this?
raise PlatformScopeError(
"only entire dataverse datasets can be archived with this tool")
@@ -227,7 +224,7 @@ class DataverseHelper(FilesetPlatformHelper):
platform_sub_id = platform_id.split('/')[-1]
archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
archiveorg_item_meta = dict(
- # XXX: collection=platform_domain,
+ # TODO: collection=platform_domain,
collection="datasets",
date=obj_latest['releaseTime'].split('T')[0],
source=
@@ -439,7 +436,7 @@ class FigshareHelper(FilesetPlatformHelper):
authors.append(author['full_name'])
archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
archiveorg_item_meta = dict(
- # XXX: collection=platform_domain,
+ # TODO: collection=platform_domain,
collection="datasets",
creator=authors,
doi=obj['doi'],
@@ -524,7 +521,7 @@ class ZenodoHelper(FilesetPlatformHelper):
else:
url = request['base_url']
- # XXX: also look in base_url and resource-non-terminal for ident? to
+ # TODO: also look in base_url and resource-non-terminal for ident? to
# check for work-level redirects
# 1. extract identifier from URL
@@ -581,7 +578,7 @@ class ZenodoHelper(FilesetPlatformHelper):
authors.append(author['name'])
archiveorg_item_name = f"{platform_domain}-{platform_id}"
archiveorg_item_meta = dict(
- # XXX: collection=platform_domain,
+ # TODO: collection=platform_domain,
collection="datasets",
creator=authors,
doi=obj['doi'],
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index d88fb46..172e1d7 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -80,7 +80,7 @@ class IngestFilesetWorker(IngestFileWorker):
# check against blocklist
for block in self.base_url_blocklist:
- # XXX: hack to not skip archive.org content
+ # NOTE: hack to not skip archive.org content
if 'archive.org' in block:
continue
if block in next_url: