diff options
author | Martin Czygan <martin@archive.org> | 2020-09-03 21:05:32 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2020-09-03 21:05:32 +0000 |
commit | cb03c66615e5316cd2a8f9c91420c9c1b5efe731 (patch) | |
tree | d3e231be0179057ddd0f869f0e11a9d12cc485e6 /python/tests/import_file_generic.py | |
parent | 2e8e22b798c190a84e6cdcd6b66fd64f43f2631b (diff) | |
parent | eb0099e4089efd07385379e105d4e30d1997408c (diff) | |
download | fatcat-cb03c66615e5316cd2a8f9c91420c9c1b5efe731.tar.gz fatcat-cb03c66615e5316cd2a8f9c91420c9c1b5efe731.zip |
Merge branch 'bnewbold-file-meta-cleanups' into 'master'
generic file entity clean-ups as part of file_meta importer
See merge request webgroup/fatcat!82
Diffstat (limited to 'python/tests/import_file_generic.py')
-rw-r--r-- | python/tests/import_file_generic.py | 99 |
1 files changed, 99 insertions, 0 deletions
diff --git a/python/tests/import_file_generic.py b/python/tests/import_file_generic.py new file mode 100644 index 00000000..cef82777 --- /dev/null +++ b/python/tests/import_file_generic.py @@ -0,0 +1,99 @@ + +import pytest + +from fatcat_tools.importers.common import EntityImporter +from fatcat_openapi_client import * + + +def test_file_update_generic(): + + f1 = FileEntity( + size=89238, + md5="7ce6615b2a5904939576d9567bd5f68e", + sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2", + sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3", + mimetype="application/pdf", + urls=[], + release_ids=[], + extra=dict(a=2, b=5), + edit_extra=dict(test_key="files rule"), + ) + assert f1 == EntityImporter.generic_file_cleanups(f1) + + url_sets = [ + # dummy + { + 'before': [], + 'after': [], + }, + # social => academicsocial + { + 'before': [ + FileUrl(url="https://academic.edu/blah.pdf", rel="social"), + ], + 'after': [ + FileUrl(url="https://academic.edu/blah.pdf", rel="academicsocial"), + ], + }, + # archive.org repository => archive + { + 'before': [ + FileUrl(url="https://archive.org/download/item/blah.pdf", rel="repository"), + ], + 'after': [ + FileUrl(url="https://archive.org/download/item/blah.pdf", rel="archive"), + ], + }, + # :80 in URL is redundant + { + 'before': [ + FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), + FileUrl(url="http://homepages.math.uic.edu:80/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), + FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), + ], + 'after': [ + FileUrl(url="http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf", rel="web"), + FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), + ], + }, + { + 'before': [ + FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), + ], + 'after': [ + FileUrl(url="http://mit.edu:80/item/blah.pdf", rel="web"), + ], + }, + # http/https redundant + { + 'before': [ + FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), + FileUrl(url="http://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), + FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"), + FileUrl(url="http://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), + ], + 'after': [ + FileUrl(url="https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf", rel="web"), + FileUrl(url="https://mit.edu/item/blah.pdf", rel="web"), + FileUrl(url="https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf", rel="webarchive"), + FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), + ], + }, + # short /2017/ wayback datetime + { + 'before': [ + FileUrl(url="https://web.archive.org/web/2017/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), + FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), + ], + 'after': [ + FileUrl(url="https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38", rel="webarchive"), + ], + }, + ] + + for pair in url_sets: + f1.urls = pair['before'] + assert EntityImporter.generic_file_cleanups(f1).urls == pair['after'] |