From eb0099e4089efd07385379e105d4e30d1997408c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 2 Sep 2020 19:28:19 -0700
Subject: generic file entity clean-ups as part of file_meta importer

---
 python/fatcat_tools/importers/common.py    |  46 ++++++++++++++
 python/fatcat_tools/importers/file_meta.py |   3 +
 python/tests/import_file_generic.py        | 100 ++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+)
 create mode 100644 python/tests/import_file_generic.py

diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index c692a38d..b34ba6f2 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -537,6 +537,52 @@ class EntityImporter:
             return None
         return self._issn_issnl_map.get(issn)
 
+    @staticmethod
+    def generic_file_cleanups(existing):
+        """
+        Conservative cleanup of existing file entities.
+
+        Intended to be used in most bulk cleanups and other file entity
+        updates, to reduce edit volume for catalog size/churn efficiency.
+
+        Note: the former check for 'None' as a wayback datetime has been
+        removed; those URLs have already been cleaned up.
+        """
+
+        # update old/deprecated 'rel' values on URLs
+        for u in existing.urls:
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                u.rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # remove URLs which are near-duplicates
+        redundant_urls = []
+        all_urls = [u.url for u in existing.urls]
+        all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url]
+        for url in all_urls:
+            # https/http redundancy
+            if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls:
+                redundant_urls.append(url)
+                continue
+            # default HTTP port (:80) included and not included
+            if ':80/' in url and url.replace(':80', '', 1) in all_urls:
+                redundant_urls.append(url)
+                continue
+            # a partial (year-only '2017') wayback timestamp is redundant if
+            # a complete-timestamp capture of the same original URL exists
+            if '://web.archive.org/web/2017/' in url:
+                original_url = "/".join(url.split("/")[5:])
+                assert len(original_url) > 5
+                for wb_url in all_wayback_urls:
+                    alt_timestamp = wb_url.split("/")[4]
+                    if len(alt_timestamp) >= 10 and original_url in wb_url:
+                        redundant_urls.append(url)
+                        break
+
+        existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
+        return existing
+
 
 class RecordPusher:
     """

diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 628ebde8..9f4b9e06 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -70,6 +70,9 @@ class FileMetaImporter(EntityImporter):
         existing.size = existing.size or fe.size
         existing.mimetype = existing.mimetype or fe.mimetype
 
+        # generic file entity cleanups
+        existing = self.generic_file_cleanups(existing)
+
         self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
         self.counts['update'] += 1
         return False

diff --git a/python/tests/import_file_generic.py b/python/tests/import_file_generic.py
new file mode 100644
index 00000000..cef82777
--- /dev/null
+++ b/python/tests/import_file_generic.py
@@ -0,0 +1,100 @@
+
+import pytest
+
+from fatcat_tools.importers.common import EntityImporter
+from fatcat_openapi_client import FileEntity, FileUrl
+
+
+def test_file_update_generic():
+
+    f1 = FileEntity(
+        size=89238,
+        md5="7ce6615b2a5904939576d9567bd5f68e",
+        sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
+        sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3",
mimetype="application/pdf", + urls=[], + release_ids=[], + extra=dict(a=2, b=5), + edit_extra=dict(test_key="files rule"), + ) + assert f1 == EntityImporter.generic_file_cleanups(f1) + + url_sets = [ + # dummy + { + 'before': [], + 'after': [], + }, + # social => academicsocial + { + 'before': [ + FileUrl(url="https://academic.edu/blah.pdf", rel="social"), + ], + 'after': [ + FileUrl(url="https://academic.edu/blah.pdf", rel="academicsocial"), + ], + }, + # archive.org repository => archive + { + 'before': [ + FileUrl(url="https://archive.org/download/item/blah.pdf", rel="repository"), + ], + 'after': [ + FileUrl(url="https://archive.org/download/item/blah.pdf", rel="archive"), + ], + }, + # :80 in URL is redundant + { + 'before': [ + FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), + FileUrl(url="http://homepages.math.uic.edu:80/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), + FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), + ], + 'after': [ + FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), + FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), + ], + }, + { + 'before': [ + FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), + ], + 'after': [ + FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), + ], + }, + # http/https redundant + { + 'before': [ + FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), + FileUrl(url="http://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), + FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"), + FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), + ], + 'after': [ + FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), + FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"), + FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), + ], + }, + # short /2017/ wayback datetime + { + 'before': [ + FileUrl(url="https://web.archive.org/web/2017/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), + FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), + ], + 'after': [ + FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), + ], + }, + ] + + for pair in url_sets: + f1.urls = pair['before'] + assert EntityImporter.generic_file_cleanups(f1).urls == pair['after'] -- cgit v1.2.3