1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
import pytest
from fatcat_tools.importers.common import EntityImporter
from fatcat_openapi_client import *
def test_file_update_generic():
    """Check the URL cleanups applied by ``EntityImporter.generic_file_cleanups``.

    A file entity with no URLs must compare equal to its cleaned-up result.
    After that, each case assigns a ``before`` URL list to the same entity
    and expects the cleaned entity's ``urls`` to equal the ``after`` list.
    """

    def web(url):
        # Shorthand for a plain "web" link.
        return FileUrl(url=url, rel="web")

    def wayback(url):
        # Shorthand for a "webarchive" link.
        return FileUrl(url=url, rel="webarchive")

    entity = FileEntity(
        size=89238,
        md5="7ce6615b2a5904939576d9567bd5f68e",
        sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
        sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3",
        mimetype="application/pdf",
        urls=[],
        release_ids=[],
        extra={"a": 2, "b": 5},
        edit_extra={"test_key": "files rule"},
    )

    # With an empty URL list the cleanup is expected to leave the entity equal.
    cleaned = EntityImporter.generic_file_cleanups(entity)
    assert entity == cleaned

    cases = [
        # Trivial case: no URLs in, no URLs out.
        dict(before=[], after=[]),
        # rel "social" is expected to come back as "academicsocial".
        dict(
            before=[FileUrl(url="https://academic.edu/blah.pdf", rel="social")],
            after=[FileUrl(url="https://academic.edu/blah.pdf", rel="academicsocial")],
        ),
        # An archive.org link tagged "repository" is expected to become "archive".
        dict(
            before=[
                FileUrl(url="https://archive.org/download/item/blah.pdf", rel="repository"),
            ],
            after=[
                FileUrl(url="https://archive.org/download/item/blah.pdf", rel="archive"),
            ],
        ),
        # An explicit ":80" URL is dropped when the same URL without the port
        # is also present.
        dict(
            before=[
                web("http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf"),
                web("http://homepages.math.uic.edu:80/~rosendal/PapersWebsite/BanachMinimalExamples.pdf"),
                web("http://mit.edu/item/blah.pdf"),
                web("http://mit.edu:80/item/blah.pdf"),
            ],
            after=[
                web("http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf"),
                web("http://mit.edu/item/blah.pdf"),
            ],
        ),
        # A lone ":80" URL (no port-less twin) is expected to be kept as-is.
        dict(
            before=[web("http://mit.edu:80/item/blah.pdf")],
            after=[web("http://mit.edu:80/item/blah.pdf")],
        ),
        # The http:// variant is dropped when the https:// variant of the same
        # URL is present; the webarchive links are expected to be kept.
        dict(
            before=[
                web("https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf"),
                web("http://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf"),
                web("https://mit.edu/item/blah.pdf"),
                wayback("https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf"),
                web("http://mit.edu/item/blah.pdf"),
                wayback("https://web.archive.org/web/12345542/something.com/blah.pdf"),
            ],
            after=[
                web("https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf"),
                web("https://mit.edu/item/blah.pdf"),
                wayback("https://web.archive.org/web/12345542/http://mit.edu/item/blah.pdf"),
                wayback("https://web.archive.org/web/12345542/something.com/blah.pdf"),
            ],
        ),
        # A wayback link with the short "/2017/" timestamp is dropped when a
        # full-timestamp capture of the same URL is present.
        dict(
            before=[
                wayback("https://web.archive.org/web/2017/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38"),
                wayback("https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38"),
            ],
            after=[
                wayback("https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38"),
            ],
        ),
    ]

    # The same entity object is reused for every case; only its URL list is
    # swapped out before each cleanup call.
    for case in cases:
        entity.urls = case["before"]
        cleaned = EntityImporter.generic_file_cleanups(entity)
        assert cleaned.urls == case["after"]
|