From eb0099e4089efd07385379e105d4e30d1997408c Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 2 Sep 2020 19:28:19 -0700
Subject: generic file entity clean-ups as part of file_meta importer

---
 python/fatcat_tools/importers/common.py    | 47 ++++++++++++++++++++++++++++++
 python/fatcat_tools/importers/file_meta.py |  3 ++
 2 files changed, 50 insertions(+)

(limited to 'python/fatcat_tools/importers')

diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index c692a38d..b34ba6f2 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -537,6 +537,53 @@ class EntityImporter:
             return None
         return self._issn_issnl_map.get(issn)
 
+    @staticmethod
+    def generic_file_cleanups(existing):
+        """
+        Conservative cleanup of existing file entities.
+
+        Intended to be used in most bulk cleanups and other file entity
+        updates, to reduce edit volume for catalog size/churn efficiency.
+        Mutates and returns `existing`: renames deprecated URL `rel` values and
+        drops near-duplicate URLs (http/https, ':80' port, partial wayback).
+
+        Note: the former check for 'None' as a wayback datetime has been
+        completely cleaned up.
+        """
+        if not existing.urls:  # None or empty: nothing to clean up
+            return existing
+
+        # update old/deprecated 'rel' on URLs
+        for u in existing.urls:
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                u.rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # remove URLs which are near-duplicates
+        all_urls = {u.url for u in existing.urls}
+        wayback_urls = [url for url in all_urls if '://web.archive.org/web/' in url]
+        redundant_urls = set()
+        for url in all_urls:
+            # https/http redundancy
+            if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls:
+                redundant_urls.add(url)
+            # default HTTP port included and not included
+            elif ':80/' in url and url.replace(':80/', '/', 1) in all_urls:
+                redundant_urls.add(url)
+            # partial and complete wayback timestamps
+            elif '://web.archive.org/web/2017/' in url:
+                original_url = "/".join(url.split("/")[5:])
+                if len(original_url) <= 5:
+                    # too short to match safely; keep the URL (formerly an assert)
+                    continue
+                # redundant if a capture with a fuller (>= 10 char) timestamp exists
+                if any(len(w.split("/")[4]) >= 10 and original_url in w for w in wayback_urls):
+                    redundant_urls.add(url)
+
+        existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
+        return existing
+
 
 class RecordPusher:
     """
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 628ebde8..9f4b9e06 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -70,6 +70,9 @@ class FileMetaImporter(EntityImporter):
         existing.size = existing.size or fe.size
         existing.mimetype = existing.mimetype or fe.mimetype
 
+        # generic file entity cleanups
+        existing = self.generic_file_cleanups(existing)
+
         self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
         self.counts['update'] += 1
         return False
-- 
cgit v1.2.3