summaryrefslogtreecommitdiffstats
path: root/python/tests
diff options
context:
space:
mode:
authorMartin Czygan <martin@archive.org>2020-12-17 18:10:36 +0000
committerMartin Czygan <martin@archive.org>2020-12-17 18:10:36 +0000
commit37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f (patch)
treed4de1d1d529776205568f55ad7f724e398e442c9 /python/tests
parentf7a75a019c9dee35542e6f92ec37937df36ff756 (diff)
parentf60ba0ea04081ac0095c12d8ecbaa48b3da74aee (diff)
downloadfatcat-37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f.tar.gz
fatcat-37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f.zip
Merge branch 'bnewbold-es-transform-html' into 'master'
Elasticsearch release transform updates: handle webcaptures better, and refactoring See merge request webgroup/fatcat!91
Diffstat (limited to 'python/tests')
-rw-r--r--python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json1
-rw-r--r--python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json1
-rw-r--r--python/tests/transform_elasticsearch.py95
3 files changed, 86 insertions, 11 deletions
diff --git a/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json b/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json
new file mode 100644
index 00000000..1c559509
--- /dev/null
+++ b/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json
@@ -0,0 +1 @@
+{"abstracts":[{"sha1":"b2523f13fc2aa730a2e2336f27d448644074e24f","content":"<p>Jakobshavn Isbræ, West Greenland, which holds a 0.6-m sea level volume equivalent, has been speeding up and retreating since the late 1990s. Interpretation of its retreat has been hindered by difficulties in measuring its ice thickness with airborne radar depth sounders. Here, we employ high-resolution, helicopter-borne gravity data from 2012 to reconstruct its bed elevation within 50 km of the ocean margin using a three-dimensional inversion constrained by fjord bathymetry data offshore and a mass conservation algorithm inland. We find the glacier trough to be asymmetric and several 100 m deeper than estimated previously in the lower part. From 1996-2016, the grounding line migrated at 0.6 km/yr from 700 m to 1,100 m depth. Upstream, the bed drops to 1,600 m over 10 km then slowly climbs to 1,200 m depth in 40 km. Jakobshavn Isbræ will continue to retreat along a retrograde slope for decades to come.\n\nAn L., E. Rignot, S.H.P. Elieff, M. Morlighem, R. Millan, J. Mouginot, D.M. Holland, D. Holland, and J. Paden (2017), Bed elevation of Jakobshavn Isbræ, West Greenland, from high-resolution airborne gravity and other data, Geophys. Res. Lett., 44, doi:10.1002/2017GL073245.\n\n</p>","mimetype":"text/html"}],"refs":[],"contribs":[{"raw_name":"Lu An","role":"author","raw_affiliation":"University of California, Irvine"}],"license_slug":"CC-BY","publisher":"UC Irvine","ext_ids":{"doi":"10.7280/d1j37z"},"release_year":2018,"release_type":"dataset","webcaptures":[],"filesets":[{"release_ids":["3mssw2qnlnblbk7oqyv2dafgey"],"urls":[{"url":"https://merritt.cdlib.org/u/ark%3A%2F13030%2Fm5rg0r8q/1","rel":"repo-bundle"},{"url":"https://merritt.cdlib.org/d/ark%3A%2F13030%2Fm5rg0r8q/1/","rel":"repo"},{"url":"dat://77e94744aa5f967e6ed7e3990bfc29f141dbf2c0fff572eb1212b3bd706882f4/files/","rel":"dweb"}],"manifest":[{"path":"JKS_BedElevation_An_etal_2017.nc","size":736484,"md5":"af738fa325833a56bf947622958fd504","sha1":"443f1867b3a56132905e8d611ad03445d8134d3c","sha256":"52438ef0035b391027e989f00208de5c16ab8f9ff619aa7f45e998d6214a452f","extra":{"mimetype":"application/x-netcdf"}}],"state":"active","ident":"ho376wmdanckpp66iwfs7g22ne","revision":"e07ab7b0-bc0e-4da2-9121-542263e84e2d","extra":{"cdl_dash":{"version":1}}}],"files":[],"work_id":"pbf2dmuu5jf4dac2k22gxsjk6y","title":"Jakobshavn Glacier Bed Elevation","state":"active","ident":"3mssw2qnlnblbk7oqyv2dafgey","revision":"23040a75-2aa6-49f2-af3c-a5c12dcceffe","extra":{"ark_id":"ark:/13030/m5rg0r8q","cdl_dash":{"version":1}}} \ No newline at end of file
diff --git a/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json b/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json
new file mode 100644
index 00000000..3bfe8564
--- /dev/null
+++ b/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json
@@ -0,0 +1 @@
+{"abstracts":[],"refs":[],"contribs":[{"index":0,"raw_name":"Catherine C. Marshall","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"CNRI Acct","issue":"3/4","volume":"14","ext_ids":{"doi":"10.1045/march2008-marshall-pt1"},"release_year":2008,"release_stage":"published","release_type":"article-journal","container_id":"ugbiirfvufgcjkx33r3cmemcuu","webcaptures":[{"release_ids":["mjtqtuyhwfdr7j2c3l36uor7uy"],"timestamp":"2019-01-06T18:58:12Z","original_url":"http://www.dlib.org/dlib/march08/marshall/03marshall-pt1.html","archive_urls":[{"url":"https://web.archive.org/web/","rel":"wayback"}],"cdx":[{"surt":"org,dlib)/dlib/march08/images/spacer00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/spacer00.gif","mimetype":"image/gif","status_code":200,"sha1":"0e75513436e6b01963759f6a88282445ff2e5b3a","sha256":"7455bacb03f7ef04d79010638db14d8434cf7a349914c2ee99eb5d4220338675"},{"surt":"org,dlib)/dlib/march08/marshall/marshall-part1-fig1.png","timestamp":"2019-01-06T19:51:01Z","url":"http://www.dlib.org/dlib/march08/marshall/marshall-part1-fig1.png","mimetype":"image/png","status_code":200,"sha1":"89cee41b938a1d2cdc51688b4be1c72366ae8102","sha256":"d63abfb99c9c48e1e6e3e37bbc5f01c0d37429f0ac0a404ae6aadc1a7d187b60"},{"surt":"org,dlib)/dlib/march08/images/redline00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/redline00.gif","mimetype":"image/gif","status_code":200,"sha1":"3a902e1d6075e37962ab37afc1567819bc3a164e","sha256":"3279d6916807f9e244beb23c91d58cd238509f77a26c06b14314f276b77b9c06"},{"surt":"org,dlib)/dlib/march08/images/commentary00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/commentary00.gif","mimetype":"image/gif","status_code":200,"sha1":"cdbf8804daa2627ef915db725b29cce9eaa9cd68","sha256":"8d8956e992a7f3004ccbbaaebe585ee4c2b1256ad418507d7c33f94b290d0b04"},{"surt":"org,dlib)/dlib/march08/style/main.css","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/style/main.css","mimetype":"text/css","status_code":200,"sha1":"425f00efb41156f03d5c139c1b24acfcbdd611cb","sha256":"ff811660270fc847b5efc3ff9d62967244c924f91a5e4796ac2e6fc8058440ff"},{"surt":"org,dlib)/dlib/march08/marshall/03marshall-pt1.html","timestamp":"2018-12-06T13:16:33Z","url":"http://www.dlib.org/dlib/march08/marshall/03marshall-pt1.html","mimetype":"text/html","status_code":200,"sha1":"8443a044aa1f4571dd1e5561d59150e34eff0dd2","sha256":"0e9c76cdf20db60b93f0d129e5336e5344aae8bd03c5dbd75a5eea8f5d1820da"}],"revision":"6019e2a1-3503-4e91-97ec-5fba3abc70af","ident":"z7uaeatyvfgwdpuxtrdu4okqii","state":"active"}],"filesets":[],"files":[],"container":{"wikidata_qid":"Q5203268","issnl":"1082-9873","publisher":"Corporation for National Research Initiatives","name":"D-Lib Magazine","extra":{"abbrev":"Dlib Mag","country":"us","issne":"1082-9873","road":{"as_of":"2018-01-24"},"szczepanski":{"as_of":"2018"},"urls":["http://www.dlib.org/"]},"revision":"3957936f-d418-4006-b830-71341068121c","ident":"ugbiirfvufgcjkx33r3cmemcuu","state":"active"},"work_id":"kqi27ogvjvcrtnritxwumkebya","title":"Rethinking Personal Digital Archiving, Part 1","state":"active","ident":"mjtqtuyhwfdr7j2c3l36uor7uy","revision":"74270e11-c961-47f7-a682-1f6ad5927205","extra":{"crossref":{"type":"journal-article"},"subtitle":["Four Challenges from the Field"]}}
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 0d96e139..b5f23e76 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -43,7 +43,7 @@ def test_rich_elasticsearch_convert():
"year_spans": [[1200, 1300]],
},
"jstor": {
- "year_spans": [[1950, 1960], [1980, 2005]],
+ "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
},
},
"sherpa_romeo": {"color": "blue"},
@@ -63,17 +63,23 @@ def test_rich_elasticsearch_convert():
)]
es = release_to_elasticsearch(r)
assert es['release_year'] == r.release_year
- assert es['in_ia'] == True
- assert es['in_jstor'] == False
- assert es['in_ia_sim'] == False
- assert es['in_ia'] == True
- assert es['in_web'] == True
- assert es['in_dweb'] == True
- assert es['is_oa'] == True
- assert es['is_longtail_oa'] == False
+ assert es['file_count'] == 1
+ assert es['fileset_count'] == 0
+ assert es['webcapture_count'] == 0
assert es['ref_count'] == 2
assert es['ref_linked_count'] == 1
+ assert es['preservation'] == "bright"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == True
+ assert es['in_dweb'] == True
+ assert es['in_ia'] == True
+ assert es['in_ia_sim'] == False
+ assert es['in_kbart'] == True
+ assert es['in_jstor'] == True
+
def test_elasticsearch_release_from_json():
r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
es = release_to_elasticsearch(r)
@@ -85,8 +91,59 @@ def test_elasticsearch_release_from_json():
assert es['issue'] == "11"
assert es['volume'] == "118"
assert es['number'] == None
+
+ assert es['preservation'] == "dark"
+ assert es['is_oa'] == False
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == False
+ assert es['in_dweb'] == False
+ assert es['in_ia'] == False
assert es['in_ia_sim'] == True
assert es['in_kbart'] == True
+ assert es['in_jstor'] == False
+
+ # this release has a fileset, and no file
+ r = entity_from_json(open('./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json', 'r').read(), ReleaseEntity)
+ es = release_to_elasticsearch(r)
+
+ assert es['title'] == "Jakobshavn Glacier Bed Elevation"
+ assert es['ident'] == "3mssw2qnlnblbk7oqyv2dafgey"
+ assert es['file_count'] == 0
+ assert es['fileset_count'] == 1
+ assert es['webcapture_count'] == 0
+
+ assert es['preservation'] == "dark"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == True
+ assert es['in_dweb'] == True
+ assert es['in_ia'] == False
+ assert es['in_ia_sim'] == False
+ assert es['in_kbart'] == False
+ assert es['in_jstor'] == False
+
+ # this release has a web capture, and no file (edited the JSON to remove file)
+ r = entity_from_json(open('./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json', 'r').read(), ReleaseEntity)
+ es = release_to_elasticsearch(r)
+
+ assert es['title'] == "Rethinking Personal Digital Archiving, Part 1"
+ assert es['ident'] == "mjtqtuyhwfdr7j2c3l36uor7uy"
+ assert es['file_count'] == 0
+ assert es['fileset_count'] == 0
+ assert es['webcapture_count'] == 1
+
+ assert es['preservation'] == "bright"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == True
+ assert es['in_dweb'] == False
+ assert es['in_ia'] == True
+ assert es['in_ia_sim'] == False
+ assert es['in_kbart'] == False
+ assert es['in_jstor'] == False
def test_elasticsearch_container_transform(journal_metadata_importer):
with open('tests/files/journal_metadata.sample.json', 'r') as f:
@@ -164,9 +221,17 @@ def test_elasticsearch_release_kbart_year():
)
es = release_to_elasticsearch(r)
assert es['release_year'] == this_year
+
+ assert es['preservation'] == "none"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == None
+ assert es['in_web'] == False
+ assert es['in_dweb'] == False
assert es['in_ia'] == False
+ assert es['in_ia_sim'] == False
assert es['in_kbart'] == False
- assert es['preservation'] == "none"
+ assert es['in_jstor'] == False
r.container = ContainerEntity(
name="dummy journal",
@@ -180,6 +245,14 @@ def test_elasticsearch_release_kbart_year():
)
es = release_to_elasticsearch(r)
assert es['release_year'] == this_year
+
+ assert es['preservation'] == "dark"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == False
+ assert es['in_dweb'] == False
assert es['in_ia'] == False
+ assert es['in_ia_sim'] == False
assert es['in_kbart'] == True
- assert es['preservation'] == "dark"
+ assert es['in_jstor'] == False