summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-10-12 14:44:44 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-10-14 18:11:12 -0700
commit: 5eccb38074104960d88df00805d0ebd7ecf839f9 (patch)
tree: a11b298e49e73686a5afc7bc7b34e1d0b0a86a79
parent: 75baf7d423a2cb119bd485672a00fd664e32537c (diff)
download: fatcat-5eccb38074104960d88df00805d0ebd7ecf839f9.tar.gz
download: fatcat-5eccb38074104960d88df00805d0ebd7ecf839f9.zip
fileset ingest small tweaks
-rw-r--r-- python/fatcat_tools/importers/ingest.py 57
1 file changed, 36 insertions, 21 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 38639297..36d72651 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -624,30 +624,45 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
# XXX: create URLs and rel for dataset ingest
if not row.get('strategy'):
return []
- if row['strategy'].startswith('archiveorg') and row.get('archiveorg_item_name'):
- return [
- fatcat_openapi_client.FilesetUrl(
- url=f"https://archive.org/download/{row['archiveorg_item_name']}",
- rel="archive",
- )
- ]
- elif row['strategy'].startswith('web') and row.get('web_base_url'):
- return [
- fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
- rel="webarchive",
- )
- ]
- elif row['strategy'] == 'web-file-bundle' and row.get('web_bundle_url'):
- return [
- fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
- rel="webarchive",
- )
- ]
+ urls = []
+ if row['strategy'] == 'archiveorg-fileset' and row.get('archiveorg_item_name'):
+ urls.append(fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
+ rel="archive",
+ ))
+ elif row['strategy'] == 'archiveorg-file-hundle' and row.get('archiveorg_item_name'):
+ # XXX: what is the filename of bundle?
+ urls.append(fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
+ rel="archive",
+ ))
+ elif row['strategy'].startswith('web') and row.get('platform_base_url'):
+ urls.append(fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
+ rel="webarchive",
+ ))
+ elif row['strategy'] == 'web-file-bundle' and row.get('platform_bundle_url'):
+ urls.append(fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
+ rel="webarchive",
+ ))
else:
+ # if no archival URLs, bail out
return []
+ # add any additional / platform URLs here
+ if row.get('platform_bundle_url'):
+ urls.append(fatcat_openapi_client.FilesetUrl(
+ url=row['platform_bundle_url'],
+ rel="repository-bundle",
+ ))
+ if row.get('platform_base_url'):
+ urls.append(fatcat_openapi_client.FilesetUrl(
+ url=row['platform_bundle_url'],
+ rel="repository",
+ ))
+ return urls
+
def parse_record(self, row):
request = row['request']