generic file entity clean-ups as part of file_meta importer

author: Bryan Newbold <bnewbold@robocracy.org> 2020-09-02 19:28:19 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-09-02 19:28:19 -0700
commit: eb0099e4089efd07385379e105d4e30d1997408c (patch)
tree: d3e231be0179057ddd0f869f0e11a9d12cc485e6
parent: 2e8e22b798c190a84e6cdcd6b66fd64f43f2631b (diff)
download: fatcat-eb0099e4089efd07385379e105d4e30d1997408c.tar.gz
fatcat-eb0099e4089efd07385379e105d4e30d1997408c.zip
3 files changed, 149 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index c692a38d..b34ba6f2 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -537,6 +537,53 @@ class EntityImporter:
             return None
         return self._issn_issnl_map.get(issn)
 
+    @staticmethod
+    def generic_file_cleanups(existing):
+        """
+        Conservative cleanup of existing file entities.
+
+        Intended to be used in most bulk cleanups and other file entity
+        updates, to reduce edit volume for catalog size/churn efficiency.
+
+        Rewrites deprecated URL 'rel' values, then drops near-duplicate URLs
+        (http vs. https, explicit ':80' port, partial wayback timestamps).
+        Mutates `existing` in place and also returns it.
+
+        Note: the former check for 'None' as a wayback datetime has been
+        completely cleaned up
+        """
+        if not existing.urls:  # nothing to clean (also tolerates urls=None)
+            return existing
+
+        # update old/deprecated 'rel' on URLs
+        for u in existing.urls:
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                u.rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # remove URLs which are near-duplicates
+        all_urls = {u.url for u in existing.urls}
+        wayback_urls = [url for url in all_urls if '://web.archive.org/web/' in url]
+        redundant_urls = set()
+        for url in all_urls:
+            if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls:
+                # https/http redundancy: the https variant is the one kept
+                redundant_urls.add(url)
+            elif ':80/' in url and url.replace(':80/', '/', 1) in all_urls:
+                # default HTTP port included and not included
+                redundant_urls.add(url)
+            elif '://web.archive.org/web/2017/' in url:
+                # partial wayback timestamp: redundant when another wayback URL
+                # with a longer (>= 10 char) timestamp contains the same target
+                original_url = "/".join(url.split("/")[5:])
+                assert len(original_url) > 5
+                if any(len(wb.split("/")[4]) >= 10 and original_url in wb for wb in wayback_urls):
+                    redundant_urls.add(url)
+
+        existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
+        return existing
+
 
 class RecordPusher:
     """
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 628ebde8..9f4b9e06 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -70,6 +70,9 @@ class FileMetaImporter(EntityImporter):
         existing.size = existing.size or fe.size
         existing.mimetype = existing.mimetype or fe.mimetype
 
+        # generic file entity cleanups
+        existing = self.generic_file_cleanups(existing)
+
         self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
         self.counts['update'] += 1
         return False
diff --git a/python/tests/import_file_generic.py b/python/tests/import_file_generic.py
new file mode 100644
index 00000000..cef82777
--- /dev/null
+++ b/python/tests/import_file_generic.py
@@ -0,0 +1,99 @@
+
+import pytest
+
+from fatcat_tools.importers.common import EntityImporter
+from fatcat_openapi_client import *
+
+
+def test_file_update_generic():
+    """
+    Table-driven check of EntityImporter.generic_file_cleanups().
+
+    Each case is a (before, after) pair of FileUrl lists: 'before' is set on
+    one shared file entity, and the URLs left after cleanup must equal
+    'after'.
+    """
+
+    entity = FileEntity(
+        size=89238,
+        md5="7ce6615b2a5904939576d9567bd5f68e",
+        sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
+        sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3",
+        mimetype="application/pdf",
+        urls=[],
+        release_ids=[],
+        extra={"a": 2, "b": 5},
+        edit_extra={"test_key": "files rule"},
+    )
+    # entity without any URLs: compare the returned entity to the input
+    cleaned = EntityImporter.generic_file_cleanups(entity)
+    assert entity == cleaned
+
+    cases = [
+        # dummy
+        ([], []),
+        # social => academicsocial
+        (
+            [FileUrl(url="https://academic.edu/blah.pdf", rel="social")],
+            [FileUrl(url="https://academic.edu/blah.pdf", rel="academicsocial")],
+        ),
+        # archive.org repository => archive
+        (
+            [
+                FileUrl(url="https://archive.org/download/item/blah.pdf", rel="repository"),
+            ],
+            [
+                FileUrl(url="https://archive.org/download/item/blah.pdf", rel="archive"),
+            ],
+        ),
+        # :80 in URL is redundant
+        (
+            [
+                FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"),
+                FileUrl(url="http://homepages.math.uic.edu:80/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"),
+                FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"),
+                FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"),
+            ],
+            [
+                FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"),
+                FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"),
+            ],
+        ),
+        # a lone :80 URL (no port-less twin present) is expected to stay
+        (
+            [FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web")],
+            [FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web")],
+        ),
+        # http/https redundant
+        (
+            [
+                FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"),
+                FileUrl(url="http://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"),
+                FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"),
+                FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"),
+                FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"),
+                FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"),
+            ],
+            [
+                FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"),
+                FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"),
+                FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"),
+                FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"),
+            ],
+        ),
+        # short /2017/ wayback datetime
+        (
+            [
+                FileUrl(url="https://web.archive.org/web/2017/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"),
+                FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"),
+            ],
+            [
+                FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"),
+            ],
+        ),
+    ]
+
+    for before, after in cases:
+        entity.urls = before
+        result = EntityImporter.generic_file_cleanups(entity)
+        assert result.urls == after
author	Bryan Newbold <bnewbold@robocracy.org>	2020-09-02 19:28:19 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2020-09-02 19:28:19 -0700
commit	eb0099e4089efd07385379e105d4e30d1997408c (patch)
tree	d3e231be0179057ddd0f869f0e11a9d12cc485e6
parent	2e8e22b798c190a84e6cdcd6b66fd64f43f2631b (diff)
download	fatcat-eb0099e4089efd07385379e105d4e30d1997408c.tar.gz fatcat-eb0099e4089efd07385379e105d4e30d1997408c.zip