@staticmethod
def generic_file_cleanups(existing):
    """
    Conservative cleanup of existing file entities.

    Intended to be used in most bulk cleanups and other file entity
    updates, to reduce edit volume for catalog size/churn efficiency.

    Two passes are made over `existing.urls`:

    1. Deprecated 'rel' values are rewritten in place: 'repository'
       becomes 'archive' for archive.org download URLs, and 'social'
       becomes 'academicsocial'.
    2. Near-duplicate URLs are dropped:
       - an 'http://' URL when the same URL with 'https://' is present
       - a URL with an explicit ':80/' port when the same URL without
         the port is present
       - a wayback URL with the partial timestamp '2017' when another
         wayback URL with a timestamp of at least 10 characters
         contains the same original URL

    Note: the former check for 'None' as a wayback datetime has been
    completely cleaned up

    Args:
        existing: file entity. Only its `urls` attribute is touched; each
            element is expected to expose `url` and `rel` attributes.

    Returns:
        The same `existing` object, with `urls` replaced by the filtered
        list (original order preserved).

    Raises:
        AssertionError: if a partial-timestamp wayback URL wraps an
            original URL of 5 characters or fewer.
    """

    # update old/deprecated 'rel' on URLs
    for u in existing.urls:
        if u.rel == 'repository' and '://archive.org/download/' in u.url:
            u.rel = 'archive'
        if u.rel == 'social':
            u.rel = 'academicsocial'

    # remove URLs which are near-duplicates
    all_urls = {u.url for u in existing.urls}
    # wayback URLs whose timestamp path segment is (close to) complete;
    # computed once here since it does not depend on the URL being tested
    full_wayback_urls = [
        u.url for u in existing.urls
        if '://web.archive.org/web/' in u.url
        and len(u.url.split("/")[4]) >= 10
    ]
    redundant_urls = set()
    for url in all_urls:
        # https/http redundancy
        if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls:
            redundant_urls.add(url)
            continue
        # default HTTP port included and not included. Replace exactly the
        # ':80/' that was matched, so an earlier ':8080' (or similar) in
        # the URL is left alone.
        if ':80/' in url and url.replace(':80/', '/', 1) in all_urls:
            redundant_urls.add(url)
            continue
        # partial and complete wayback timestamps
        if '://web.archive.org/web/2017/' in url:
            original_url = "/".join(url.split("/")[5:])
            # sanity check: a near-empty original URL would substring-match
            # almost any wayback URL below
            assert len(original_url) > 5
            if any(original_url in wb_url for wb_url in full_wayback_urls):
                redundant_urls.add(url)

    existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
    return existing