summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-09-02 19:28:19 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-09-02 19:28:19 -0700
commit eb0099e4089efd07385379e105d4e30d1997408c (patch)
tree d3e231be0179057ddd0f869f0e11a9d12cc485e6 /python/fatcat_tools/importers
parent 2e8e22b798c190a84e6cdcd6b66fd64f43f2631b (diff)
download fatcat-eb0099e4089efd07385379e105d4e30d1997408c.tar.gz
fatcat-eb0099e4089efd07385379e105d4e30d1997408c.zip
generic file entity clean-ups as part of file_meta importer
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/common.py 47
-rw-r--r-- python/fatcat_tools/importers/file_meta.py 3
2 files changed, 50 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index c692a38d..b34ba6f2 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -537,6 +537,53 @@ class EntityImporter:
return None
return self._issn_issnl_map.get(issn)
@staticmethod
def generic_file_cleanups(existing):
    """
    Conservative cleanup of existing file entities.

    Intended to be used in most bulk cleanups and other file entity
    updates, to reduce edit volume for catalog size/churn efficiency.

    The entity is modified in place and also returned. Two kinds of
    cleanup are applied to ``existing.urls``:

    - old/deprecated 'rel' values are rewritten ('repository' on an
      archive.org download URL becomes 'archive'; 'social' becomes
      'academicsocial')
    - near-duplicate URLs are dropped: an 'http://' URL whose 'https://'
      twin is present, a URL with an explicit ':80' port whose port-less
      twin is present, and a wayback URL with the partial '2017'
      timestamp when a wayback URL for the same original URL with a
      longer (10+ character) timestamp is present

    An entity whose ``urls`` is None or empty is returned untouched.

    Note: the former check for 'None' as a wayback datetime has been
    completely cleaned up
    """

    # nothing to clean up; also avoids iterating over None
    if not existing.urls:
        return existing

    # update old/deprecated 'rel' on URLs
    for u in existing.urls:
        if u.rel == 'repository' and '://archive.org/download/' in u.url:
            u.rel = 'archive'
        elif u.rel == 'social':
            u.rel = 'academicsocial'

    # remove URLs which are near-duplicates
    all_urls = {u.url for u in existing.urls}
    all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url]
    redundant_urls = set()
    for url in all_urls:
        # https/http redundancy
        if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls:
            redundant_urls.add(url)
            continue
        # default HTTP port included and not included; strip exactly the
        # ':80/' occurrence that was matched, not the first ':80' substring
        # (which could be part of a different port like ':8080')
        if ':80/' in url and url.replace(':80/', '/', 1) in all_urls:
            redundant_urls.add(url)
            continue
        # partial and complete wayback timestamps
        if '://web.archive.org/web/2017/' in url:
            original_url = "/".join(url.split("/")[5:])
            # too short to match safely as a substring; keep the URL
            # rather than aborting the whole cleanup
            if len(original_url) <= 5:
                continue
            # wayback URLs always have at least 5 '/'-separated parts, so
            # index 4 (the timestamp segment) is safe
            if any(
                len(wb_url.split("/")[4]) >= 10 and original_url in wb_url
                for wb_url in all_wayback_urls
            ):
                redundant_urls.add(url)

    existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
    return existing
+
class RecordPusher:
"""
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 628ebde8..9f4b9e06 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -70,6 +70,9 @@ class FileMetaImporter(EntityImporter):
existing.size = existing.size or fe.size
existing.mimetype = existing.mimetype or fe.mimetype
+ # generic file entity cleanups
+ existing = self.generic_file_cleanups(existing)
+
self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
self.counts['update'] += 1
return False