diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:15:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:15:33 -0700 |
commit | 4c0e3f9c02692cd2cab0657d7fbcd1861a586076 (patch) | |
tree | d3a06b0425d4e3477ee3381b23407e92e6c46f45 /python | |
parent | 41fae4c294e2ba43370b4a4193c0f6107201dbf0 (diff) | |
download | sandcrawler-4c0e3f9c02692cd2cab0657d7fbcd1861a586076.tar.gz sandcrawler-4c0e3f9c02692cd2cab0657d7fbcd1861a586076.zip |
update 'XXX' notes from fileset ingest development
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 13 | ||||
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 2 |
2 files changed, 6 insertions, 9 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index b6808b5..6ab4781 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -32,9 +32,6 @@ class FilesetPlatformHelper(): assert item.manifest total_size = sum([m.size for m in item.manifest]) or 0 largest_size = max([m.size or 0 for m in item.manifest]) or 0 - #print(f" total_size={total_size} largest_size={largest_size}", file=sys.stderr) - # XXX: while developing ArchiveorgFileset path -#return IngestStrategy.ArchiveorgFileset if len(item.manifest) == 1: if total_size < 64 * 1024 * 1024: return IngestStrategy.WebFile @@ -174,7 +171,7 @@ class DataverseHelper(FilesetPlatformHelper): raise PlatformScopeError("not actually in scope") if parsed_id['file_id']: - # XXX: maybe we could support this? + # TODO: maybe we could support this? raise PlatformScopeError( "only entire dataverse datasets can be archived with this tool") @@ -227,7 +224,7 @@ class DataverseHelper(FilesetPlatformHelper): platform_sub_id = platform_id.split('/')[-1] archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}" archiveorg_item_meta = dict( - # XXX: collection=platform_domain, + # TODO: collection=platform_domain, collection="datasets", date=obj_latest['releaseTime'].split('T')[0], source= @@ -439,7 +436,7 @@ class FigshareHelper(FilesetPlatformHelper): authors.append(author['full_name']) archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}" archiveorg_item_meta = dict( - # XXX: collection=platform_domain, + # TODO: collection=platform_domain, collection="datasets", creator=authors, doi=obj['doi'], @@ -524,7 +521,7 @@ class ZenodoHelper(FilesetPlatformHelper): else: url = request['base_url'] - # XXX: also look in base_url and resource-non-terminal for ident? to + # TODO: also look in base_url and resource-non-terminal for ident? to # check for work-level redirects
# 1. extract identifier from URL @@ -581,7 +578,7 @@ class ZenodoHelper(FilesetPlatformHelper): authors.append(author['name']) archiveorg_item_name = f"{platform_domain}-{platform_id}" archiveorg_item_meta = dict( - # XXX: collection=platform_domain, + # TODO: collection=platform_domain, collection="datasets", creator=authors, doi=obj['doi'], diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index d88fb46..172e1d7 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -80,7 +80,7 @@ class IngestFilesetWorker(IngestFileWorker): # check against blocklist for block in self.base_url_blocklist: - # XXX: hack to not skip archive.org content + # NOTE: hack to not skip archive.org content if 'archive.org' in block: continue if block in next_url: |