diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-09-02 19:28:19 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-09-02 19:28:19 -0700 | 
| commit | eb0099e4089efd07385379e105d4e30d1997408c (patch) | |
| tree | d3e231be0179057ddd0f869f0e11a9d12cc485e6 /python | |
| parent | 2e8e22b798c190a84e6cdcd6b66fd64f43f2631b (diff) | |
| download | fatcat-eb0099e4089efd07385379e105d4e30d1997408c.tar.gz fatcat-eb0099e4089efd07385379e105d4e30d1997408c.zip | |
generic file entity clean-ups as part of file_meta importer
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 47 |
| -rw-r--r-- | python/fatcat_tools/importers/file_meta.py | 3 |
| -rw-r--r-- | python/tests/import_file_generic.py | 99 |
3 files changed, 149 insertions, 0 deletions
| diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index c692a38d..b34ba6f2 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -537,6 +537,53 @@ class EntityImporter:              return None          return self._issn_issnl_map.get(issn) +    @staticmethod +    def generic_file_cleanups(existing): +        """ +        Conservative cleanup of existing file entities. + +        Intended to be used in most bulk cleanups and other file entity +        updates, to reduce edit volume for catalog size/churn efficiency. + +        Note: the former check for 'None' as a wayback datetime has been +        completely cleaned up +        """ + +        # update old/deprecated 'rel' on URLs +        for i in range(len(existing.urls)): +            u = existing.urls[i] +            if u.rel == 'repository' and '://archive.org/download/' in u.url: +                existing.urls[i].rel = 'archive' +            if u.rel == 'social': +                u.rel = 'academicsocial' + +        # remove URLs which are near-duplicates +        redundant_urls = [] +        all_urls = [u.url for u in existing.urls] +        all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url] +        for url in all_urls: +            # https/http redundancy +            if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls: +                redundant_urls.append(url) +                continue +            # default HTTP port included and not included +            if ':80/' in url and url.replace(':80', '', 1) in all_urls: +                redundant_urls.append(url) +                continue +            # partial and complete wayback timestamps +            if '://web.archive.org/web/2017/' in url: +                original_url = "/".join(url.split("/")[5:]) +                assert len(original_url) > 5 +                for wb_url in 
all_wayback_urls: +                    alt_timestamp = wb_url.split("/")[4] +                    print(alt_timestamp) +                    if len(alt_timestamp) >= 10 and original_url in wb_url: +                        redundant_urls.append(url) +                        break + +        existing.urls = [u for u in existing.urls if u.url not in redundant_urls] +        return existing +  class RecordPusher:      """ diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 628ebde8..9f4b9e06 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -70,6 +70,9 @@ class FileMetaImporter(EntityImporter):          existing.size = existing.size or fe.size          existing.mimetype = existing.mimetype or fe.mimetype +        # generic file entity cleanups +        existing = self.generic_file_cleanups(existing) +          self.api.update_file(self.get_editgroup_id(), existing.ident, existing)          self.counts['update'] += 1          return False diff --git a/python/tests/import_file_generic.py b/python/tests/import_file_generic.py new file mode 100644 index 00000000..cef82777 --- /dev/null +++ b/python/tests/import_file_generic.py @@ -0,0 +1,99 @@ + +import pytest + +from fatcat_tools.importers.common import EntityImporter +from fatcat_openapi_client import * + + +def test_file_update_generic(): + +    f1 = FileEntity( +        size=89238, +        md5="7ce6615b2a5904939576d9567bd5f68e", +        sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2", +        sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3", +        mimetype="application/pdf", +        urls=[], +        release_ids=[], +        extra=dict(a=2, b=5), +        edit_extra=dict(test_key="files rule"), +    ) +    assert f1 == EntityImporter.generic_file_cleanups(f1) + +    url_sets = [ +        # dummy +        { +            'before': [], +            'after': [], +        }, +        
# social => academicsocial +        { +            'before': [ +                FileUrl(url="https://academic.edu/blah.pdf", rel="social"), +            ], +            'after': [ +                FileUrl(url="https://academic.edu/blah.pdf", rel="academicsocial"), +            ], +        }, +        # archive.org repository => archive +        { +            'before': [ +                FileUrl(url="https://archive.org/download/item/blah.pdf", rel="repository"), +            ], +            'after': [ +                FileUrl(url="https://archive.org/download/item/blah.pdf", rel="archive"), +            ], +        }, +        # :80 in URL is redundant +        { +            'before': [ +                FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), +                FileUrl(url="http://homepages.math.uic.edu:80/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), +                FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), +                FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), +            ], +            'after': [ +                FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), +                FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), +            ], +        }, +        { +            'before': [ +                FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), +            ], +            'after': [ +                FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), +            ], +        }, +        # http/https redundant +        { +            'before': [ +                FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), +                FileUrl(url="http://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), +                
FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"), +                FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"), +                FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), +                FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), +            ], +            'after': [ +                FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), +                FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"), +                FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"), +                FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), +            ], +        }, +        # short /2017/ wayback datetime +        { +            'before': [ +                FileUrl(url="https://web.archive.org/web/2017/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), +                FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), +            ], +            'after': [ +                FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), +            ], +        }, +    ] + +    for pair in url_sets: +        f1.urls = pair['before'] +        assert EntityImporter.generic_file_cleanups(f1).urls == pair['after'] | 
