From 5eccb38074104960d88df00805d0ebd7ecf839f9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 12 Oct 2021 14:44:44 -0700 Subject: fileset ingest small tweaks --- python/fatcat_tools/importers/ingest.py | 57 +++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 21 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 38639297..36d72651 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -624,30 +624,45 @@ class IngestFilesetResultImporter(IngestFileResultImporter): # XXX: create URLs and rel for dataset ingest if not row.get('strategy'): return [] - if row['strategy'].startswith('archiveorg') and row.get('archiveorg_item_name'): - return [ - fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}", - rel="archive", - ) - ] - elif row['strategy'].startswith('web') and row.get('web_base_url'): - return [ - fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", - rel="webarchive", - ) - ] - elif row['strategy'] == 'web-file-bundle' and row.get('web_bundle_url'): - return [ - fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", - rel="webarchive", - ) - ] + urls = [] + if row['strategy'] == 'archiveorg-fileset' and row.get('archiveorg_item_name'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive", + )) + elif row['strategy'] == 'archiveorg-file-hundle' and row.get('archiveorg_item_name'): + # XXX: what is the filename of bundle? + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive", + )) + elif row['strategy'].startswith('web') and row.get('platform_base_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", + rel="webarchive", + )) + elif row['strategy'] == 'web-file-bundle' and row.get('platform_bundle_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", + rel="webarchive", + )) else: + # if no archival URLs, bail out return [] + # add any additional / platform URLs here + if row.get('platform_bundle_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=row['platform_bundle_url'], + rel="repository-bundle", + )) + if row.get('platform_base_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=row['platform_bundle_url'], + rel="repository", + )) + return urls + def parse_record(self, row): request = row['request'] -- cgit v1.2.3