summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--extra/elasticsearch/README.md7
-rw-r--r--extra/elasticsearch/changelog_schema.json29
-rw-r--r--extra/elasticsearch/container_schema.json68
-rw-r--r--extra/elasticsearch/file_schema.json59
-rw-r--r--extra/elasticsearch/release_schema.json101
-rw-r--r--extra/stats/2020-02-19-prod-stats.json1
-rw-r--r--extra/stats/2020-02-19-prod-table-sizes.txt46
-rw-r--r--proposals/2020_elasticsearch_schemas.md12
-rw-r--r--proposals/2020_sql_size_reduction.md16
-rw-r--r--python/Pipfile7
-rw-r--r--python/Pipfile.lock490
-rwxr-xr-xpython/fatcat_import.py15
-rwxr-xr-xpython/fatcat_ingest.py5
-rw-r--r--python/fatcat_tools/importers/__init__.py1
-rw-r--r--python/fatcat_tools/importers/common.py9
-rw-r--r--python/fatcat_tools/importers/ingest.py14
-rw-r--r--python/fatcat_tools/importers/shadow.py195
-rw-r--r--python/fatcat_tools/transforms/__init__.py2
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py242
-rw-r--r--python/fatcat_tools/workers/changelog.py40
-rwxr-xr-xpython/fatcat_transform.py34
-rw-r--r--python/fatcat_web/entity_helpers.py4
-rw-r--r--python/fatcat_web/search.py4
-rw-r--r--python/fatcat_web/templates/release_view.html5
-rw-r--r--python/tests/files/changelog_3469683.json1
-rw-r--r--python/tests/files/example_shadow.json10
-rw-r--r--python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json1
-rw-r--r--python/tests/files/release_etodop5banbndg3faecnfm6ozi.json1
-rw-r--r--python/tests/import_shadow.py61
-rw-r--r--python/tests/transform_elasticsearch.py (renamed from python/tests/transform_tests.py)62
30 files changed, 1183 insertions, 359 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 3a48a178..17865bc0 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -40,9 +40,11 @@ Drop and rebuild the schema:
http delete :9200/fatcat_release
http delete :9200/fatcat_container
+ http delete :9200/fatcat_file
http delete :9200/fatcat_changelog
http put :9200/fatcat_release < release_schema.json
http put :9200/fatcat_container < container_schema.json
+ http put :9200/fatcat_file < file_schema.json
http put :9200/fatcat_changelog < changelog_schema.json
Put a single object (good for debugging):
@@ -57,8 +59,9 @@ Bulk insert from a file on disk:
Or, in a bulk production live-stream conversion:
export LC_ALL=C.UTF-8
- time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release
- time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_container -type container
+ time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type release
+ time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type container
+ time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type file
## Index Aliases
diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index f3211e99..d8342549 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -8,6 +8,18 @@
"tokenizer": "standard",
"filter": [ "lowercase", "asciifolding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -16,20 +28,29 @@
"changelog": {
"properties": {
"index": { "type": "integer" },
- "editgroup_id": { "type": "keyword" },
+ "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"timestamp": { "type": "date" },
- "editor_id": { "type": "keyword" },
- "username": { "type": "keyword" },
+ "editor_id": { "type": "keyword", "normalizer": "default" },
+ "username": { "type": "keyword", "normalizer": "caseSensitive" },
"is_bot": { "type": "boolean" },
"is_admin": { "type": "boolean" },
- "agent": { "type": "keyword" },
+ "agent": { "type": "keyword", "normalizer": "caseSensitive" },
+
"containers": { "type": "integer" },
+ "new_containers": { "type": "integer" },
"creators": { "type": "integer" },
+ "new_creators": { "type": "integer" },
"files": { "type": "integer" },
+ "new_files": { "type": "integer" },
"filessets": { "type": "integer" },
+ "new_filessets": { "type": "integer" },
"webcaptures": { "type": "integer" },
+ "new_webcaptures": { "type": "integer" },
"releases": { "type": "integer" },
+ "new_releases": { "type": "integer" },
"works": { "type": "integer" },
+ "new_works": { "type": "integer" },
+
"created": { "type": "integer" },
"updated": { "type": "integer" },
"deleted": { "type": "integer" },
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index b0a47e85..5cd85b04 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -20,6 +20,18 @@
"char_filter": [ "icu_normalizer" ],
"filter": [ "icu_folding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -27,43 +39,51 @@
"mappings": {
"container": {
"properties": {
- "ident": { "type": "keyword" },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword" },
- "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_type": { "type": "keyword" },
- "issnl": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
- "country": { "type": "keyword" },
- "region": { "type": "keyword" },
- "discipline": { "type": "keyword" },
- "languages": { "type": "keyword" },
- "mimetypes": { "type": "keyword" },
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "publisher_type": { "type": "keyword", "normalizer": "default" },
+ "container_type": { "type": "keyword", "normalizer": "default" },
+ "issnl": { "type": "keyword", "normalizer": "default" },
+ "issns": { "type": "keyword", "normalizer": "default" },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default" },
+ "country_code": { "type": "keyword", "normalizer": "default" },
+ "region": { "type": "keyword", "normalizer": "default" },
+ "discipline": { "type": "keyword", "normalizer": "default" },
+ "languages": { "type": "keyword", "normalizer": "default" },
+ "mimetypes": { "type": "keyword", "normalizer": "default" },
"first_year": { "type": "integer" },
"last_year": { "type": "integer" },
- "in_doaj": { "type": "boolean" },
- "in_road": { "type": "boolean" },
- "in_doi": { "type": "boolean" },
- "in_sherpa_romeo":{ "type": "boolean" },
- "is_oa": { "type": "boolean" },
- "is_longtail_oa": { "type": "boolean" },
- "any_kbart": { "type": "boolean" },
- "any_jstor": { "type": "boolean" },
- "any_ia_sim": { "type": "boolean" },
+
+ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+ "in_doaj": { "type": "boolean" },
+ "in_road": { "type": "boolean" },
+ "is_oa": { "type": "boolean" },
+ "is_longtail_oa": { "type": "boolean" },
+ "any_kbart": { "type": "boolean" },
+ "any_jstor": { "type": "boolean" },
+ "any_ia_sim": { "type": "boolean" },
+ "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" },
"releases_total": { "type": "integer" },
"releases_kbart": { "type": "integer" },
"releases_ia": { "type": "integer" },
- "releases_sim": { "type": "integer" },
- "releases_shadow": { "type": "integer" },
+ "releases_ia_sim": { "type": "integer" },
+ "releases_shadows": { "type": "integer" },
"releases_any_file": { "type": "integer" },
"releases_any_fileset": { "type": "integer" },
"releases_any_webcapture": { "type": "integer" },
"year": { "type": "alias", "path": "first_year" },
"type": { "type": "alias", "path": "container_type" },
+ "issn": { "type": "alias", "path": "issns" },
"oa": { "type": "alias", "path": "is_oa" },
"longtail": { "type": "alias", "path": "is_longtail_oa" }
}
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
new file mode 100644
index 00000000..9c8ee64c
--- /dev/null
+++ b/extra/elasticsearch/file_schema.json
@@ -0,0 +1,59 @@
+{
+"settings": {
+ "index": {
+ "analysis": {
+ "analyzer": {
+ "default": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [ "lowercase", "asciifolding" ]
+ }
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
+ }
+ }
+ }
+},
+"mappings": {
+ "file": {
+ "properties": {
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+
+ "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "release_count": { "type": "integer" },
+ "mimetype": { "type": "keyword", "normalizer": "default" },
+ "size_bytes": { "type": "integer" },
+ "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "md5": { "type": "keyword", "normalizer": "default", "doc_values": false },
+
+ "domains": { "type": "keyword", "normalizer": "default" },
+ "hosts": { "type": "keyword", "normalizer": "default" },
+ "rels": { "type": "keyword", "normalizer": "default" },
+ "in_ia": { "type": "boolean" },
+ "in_ia_petabox": { "type": "boolean" },
+
+ "release_id": { "type": "alias", "path": "release_ids" },
+ "sha1hex": { "type": "alias", "path": "sha1" },
+ "sha256hex": { "type": "alias", "path": "sha256" },
+ "md5hex": { "type": "alias", "path": "md5" },
+ "size": { "type": "alias", "path": "size_bytes" },
+ "domain": { "type": "alias", "path": "domains" },
+ "host": { "type": "alias", "path": "hosts" },
+ "rel": { "type": "alias", "path": "rels" }
+ }
+ }
+}
+}
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 85026060..666a672f 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -20,6 +20,18 @@
"char_filter": [ "icu_normalizer" ],
"filter": [ "icu_folding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -27,48 +39,66 @@
"mappings": {
"release": {
"properties": {
- "ident": { "type": "keyword" },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword" },
- "work_id": { "type": "keyword" },
- "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "work_id": { "type": "keyword", "normalizer": "default" },
+ "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"release_date": { "type": "date" },
- "release_year": { "type": "integer" },
- "release_type": { "type": "keyword" },
- "release_stage": { "type": "keyword" },
- "withdrawn_status": { "type": "keyword" },
- "language": { "type": "keyword" },
- "doi": { "type": "keyword" },
- "pmid": { "type": "keyword" },
- "pmcid": { "type": "keyword" },
- "isbn13": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
- "core_id": { "type": "keyword" },
- "axiv_id": { "type": "keyword" },
- "jstor_id": { "type": "keyword" },
- "ark_id": { "type": "keyword" },
- "mag_id": { "type": "keyword" },
- "license": { "type": "keyword" },
+ "release_year": { "type": "integer", "copy_to": "biblio" },
+ "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "release_stage": { "type": "keyword", "normalizer": "default" },
+ "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "language": { "type": "keyword", "normalizer": "default" },
+ "country_code": { "type": "keyword", "normalizer": "default" },
+ "country_code_upper": { "type": "keyword", "normalizer": "caseSensitive" },
+ "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "first_page": { "type": "keyword", "normalizer": "default" },
+ "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "doi": { "type": "keyword", "normalizer": "default" },
+ "doi_prefix": { "type": "keyword", "normalizer": "default" },
+ "doi_registrar": { "type": "keyword", "normalizer": "default" },
+ "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "license": { "type": "keyword", "normalizer": "default" },
"publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_id": { "type": "keyword" },
- "container_issnl": { "type": "keyword" },
- "container_type": { "type": "keyword" },
+ "publisher_type": { "type": "keyword", "normalizer": "default" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "container_id": { "type": "keyword", "normalizer": "default" },
+ "container_issnl": { "type": "keyword", "normalizer": "default" },
+ "container_type": { "type": "keyword", "normalizer": "default" },
"contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "creator_ids": { "type": "keyword" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "affiliation_rors": { "type": "keyword", "normalizer": "default" },
+ "creator_ids": { "type": "keyword", "normalizer": "default" },
"ref_count": { "type": "integer" },
"ref_linked_count": { "type": "integer" },
+ "ref_release_ids": { "type": "keyword", "normalizer": "default" },
"file_count": { "type": "integer" },
"fileset_count": { "type": "integer" },
"webcapture_count": { "type": "integer" },
"any_abstract": { "type": "boolean" },
- "best_pdf_url": { "type": "keyword" },
- "ia_pdf_url": { "type": "keyword" },
+ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+ "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
+ "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
+ "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
"is_oa": { "type": "boolean" },
+ "oa_color": { "type": "keyword", "normalizer": "default" },
"is_longtail_oa": { "type": "boolean" },
"is_preserved": { "type": "boolean" },
"in_kbart": { "type": "boolean" },
@@ -79,7 +109,13 @@
"in_ia_sim": { "type": "boolean" },
"in_shadows": { "type": "boolean" },
"is_superceded": { "type": "boolean" },
+ "is_retracted": { "type": "boolean" },
+ "preservation": { "type": "keyword", "normalizer": "default" },
+ "affiliation": { "type": "alias", "path": "affiliations" },
+ "ror": { "type": "alias", "path": "affiliation_rors" },
+ "creator_id": { "type": "alias", "path": "creator_ids" },
+ "ref_release_id": { "type": "alias", "path": "ref_release_ids" },
"author": { "type": "alias", "path": "contrib_names" },
"journal": { "type": "alias", "path": "container_name" },
"date": { "type": "alias", "path": "release_date" },
@@ -90,6 +126,9 @@
"lang": { "type": "alias", "path": "language" },
"file_pdf_url": { "type": "alias", "path": "best_pdf_url" },
"release_status": { "type": "alias", "path": "release_stage" },
+ "stage": { "type": "alias", "path": "release_stage" },
+ "type": { "type": "alias", "path": "release_type" },
+ "retracted": { "type": "alias", "path": "is_retracted" },
"is_kept": { "type": "alias", "path": "in_kbart" }
}
}
diff --git a/extra/stats/2020-02-19-prod-stats.json b/extra/stats/2020-02-19-prod-stats.json
new file mode 100644
index 00000000..a2313233
--- /dev/null
+++ b/extra/stats/2020-02-19-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":3509511,"timestamp":"2020-02-20T01:42:50.980212+00:00"}},"container":{"total":148356},"papers":{"in_kbart":60523853,"in_web":19616767,"in_web_not_kbart":8937938,"is_oa":11524180,"total":105665352},"release":{"refs_total":889522285,"total":143709455}}
diff --git a/extra/stats/2020-02-19-prod-table-sizes.txt b/extra/stats/2020-02-19-prod-table-sizes.txt
new file mode 100644
index 00000000..cab2b52e
--- /dev/null
+++ b/extra/stats/2020-02-19-prod-table-sizes.txt
@@ -0,0 +1,46 @@
+Size: 476.74G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 53 GB | 43 GB | 96 GB
+ "public"."release_rev" | 58 GB | 33 GB | 91 GB
+ "public"."refs_blob" | 85 GB | 2884 MB | 88 GB
+ "public"."release_edit" | 14 GB | 20 GB | 34 GB
+ "public"."work_edit" | 13 GB | 20 GB | 34 GB
+ "public"."release_ident" | 9504 MB | 15 GB | 24 GB
+ "public"."work_ident" | 9302 MB | 15 GB | 24 GB
+ "public"."abstracts" | 16 GB | 1501 MB | 18 GB
+ "public"."file_rev_url" | 9980 MB | 3550 MB | 13 GB
+ "public"."work_rev" | 6038 MB | 5825 MB | 12 GB
+ "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB
+ "public"."file_rev" | 3472 MB | 5103 MB | 8574 MB
+ "public"."file_edit" | 2934 MB | 3959 MB | 6893 MB
+ "public"."release_rev_abstract" | 2402 MB | 3339 MB | 5742 MB
+ "public"."file_ident" | 1795 MB | 2437 MB | 4231 MB
+ "public"."file_rev_release" | 1651 MB | 2428 MB | 4078 MB
+ "public"."creator_edit" | 702 MB | 942 MB | 1643 MB
+ "public"."creator_rev" | 695 MB | 719 MB | 1413 MB
+ "public"."editgroup" | 761 MB | 404 MB | 1164 MB
+ "public"."creator_ident" | 474 MB | 648 MB | 1121 MB
+ "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB
+ "public"."changelog" | 218 MB | 214 MB | 432 MB
+ "public"."container_rev" | 75 MB | 23 MB | 98 MB
+ "public"."container_edit" | 25 MB | 31 MB | 56 MB
+ "public"."container_ident" | 11 MB | 19 MB | 30 MB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."auth_oidc" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editor" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md
index 83db884f..c3e79073 100644
--- a/proposals/2020_elasticsearch_schemas.md
+++ b/proposals/2020_elasticsearch_schemas.md
@@ -14,8 +14,6 @@ Simple additions:
- pages
- `first_page` (parsed from pages) (?)
- number
-- `in_shadow`
-- OA license slug (?)
- `doi_prefix`
- `doi_registrar` (based on extra)
- `first_author` (surname; for matching)
@@ -25,6 +23,8 @@ Simple additions:
- referenced releases idents
- contrib creator idents
+Add affiliations, both as raw strings and ROR identifiers.
+
## Preservation Summary Field
@@ -33,8 +33,8 @@ status (from `in_kbart`, `in_ia`, etc) to a `preservation_status` flag which
is:
- `bright`
-- `dark_only`
-- `shadow_only`
+- `dark`
+- `shadows_only`
- `none`
Note that these don't align with OA color or work-level preservation (aka, no
@@ -128,8 +128,8 @@ hit does not}").
## Container Fields
-- `all_issns`
-- `release_count`
+- `issn` (all issns)
+- `original_name`
The `release_count` would not be indexed (left null) by default, and would be
"patched" in to entities by a separate script (periodically?).
diff --git a/proposals/2020_sql_size_reduction.md b/proposals/2020_sql_size_reduction.md
index f421e455..2fa39873 100644
--- a/proposals/2020_sql_size_reduction.md
+++ b/proposals/2020_sql_size_reduction.md
@@ -52,6 +52,8 @@ Other growth is expected to be much smaller, let's say a few GB of disk.
This works out to a bit over 600 GByte total disk size.
+NOTE: math was wrong? 470 + 80 + 100 -> 650 GByte, call it 700 GByte
+
## Idea: finish `ext_id` migration and drop columns+index from `release_rev`
@@ -172,3 +174,17 @@ would drop ~20% of data size and ~20% of index size.
Would it make more sense to use {ident, editgroup} as the primary key and UNIQ,
then have a separate index on `editgroup`? On the assumption that `editgroup`
cardinality is much smaller, thus the index disk usage would be smaller.
+
+## Idea: use binary for hashes
+
+We currently store file hashes (SHA-1, SHA-256, MD5) and abstracts/`ref_blobs`
+keys as TEXT in lower-case hex encoding. Using binary instead could be as much
+as a 50% size savings for both column and index storage. The difference becomes
+more apparent when all files have all hashes populated.
+
+base32 encoded strings would be smaller (but non-negligable) savings.
+
+This change has a reasonable migration path, is entirely internal to postgres
+and fatcatd, and would be no change to API schema. Postgres also allows `hex`
+encoding on `bytea` data type, which can make reading/debugging reasonable.
+
diff --git a/python/Pipfile b/python/Pipfile
index 1a19a145..a2ce14e1 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -30,6 +30,8 @@ Flask-Misaka = "*"
flask-mwoauth = "*"
WTForms = "*"
loginpass = ">=0.4"
+# loginpass 0.4 is not actually compatible with newer authlib
+authlib = "<0.13"
requests = ">=2"
raven = {extras = ['flask'],version = "*"}
pykafka = ">=2"
@@ -51,6 +53,11 @@ dateparser = ">=0.7"
langdetect = "*"
pathlib2 = "*"
pycountry = "*"
+tldextract = "*"
+
+# this is only to lock to a python3.5-compatible version. needed by an
+# importlib-metadata, under pytest
+zipp = "<2.0.0"
[requires]
# Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index d813dd32..0b24eb88 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "edac46fb8a4b49aea89946621337abba881fff6f6768107d2529c752bcf702ac"
+ "sha256": "03588fffd1f322432b0e3d1197d0797a300b38f42aa84b4e1d5909e9a3d5a07a"
},
"pipfile-spec": 6,
"requires": {
@@ -18,10 +18,11 @@
"default": {
"authlib": {
"hashes": [
- "sha256:401a4a184379a6551b79af30243ce1ad56da0a8da5b4925e0be260f07ee40009",
- "sha256:e9a6a6c5748109bd9ce9f24e71d293156c43bd0d6595bc4e63dd8f5b9e809149"
+ "sha256:40728195efe915c96209a6a57ad6b5565a951bb469f01bd0c38ea13a0bac6c68",
+ "sha256:4b6a82217b376fe5e7a9e5348865591077c2e6e5d987f4741ac74975b75aa145"
],
- "version": "==0.13"
+ "index": "pypi",
+ "version": "==0.12.1"
},
"beautifulsoup4": {
"hashes": [
@@ -53,41 +54,36 @@
},
"cffi": {
"hashes": [
- "sha256:0b49274afc941c626b605fb59b59c3485c17dc776dc3cc7cc14aca74cc19cc42",
- "sha256:0e3ea92942cb1168e38c05c1d56b0527ce31f1a370f6117f1d490b8dcd6b3a04",
- "sha256:135f69aecbf4517d5b3d6429207b2dff49c876be724ac0c8bf8e1ea99df3d7e5",
- "sha256:19db0cdd6e516f13329cba4903368bff9bb5a9331d3410b1b448daaadc495e54",
- "sha256:2781e9ad0e9d47173c0093321bb5435a9dfae0ed6a762aabafa13108f5f7b2ba",
- "sha256:291f7c42e21d72144bb1c1b2e825ec60f46d0a7468f5346841860454c7aa8f57",
- "sha256:2c5e309ec482556397cb21ede0350c5e82f0eb2621de04b2633588d118da4396",
- "sha256:2e9c80a8c3344a92cb04661115898a9129c074f7ab82011ef4b612f645939f12",
- "sha256:32a262e2b90ffcfdd97c7a5e24a6012a43c61f1f5a57789ad80af1d26c6acd97",
- "sha256:3c9fff570f13480b201e9ab69453108f6d98244a7f495e91b6c654a47486ba43",
- "sha256:415bdc7ca8c1c634a6d7163d43fb0ea885a07e9618a64bda407e04b04333b7db",
- "sha256:42194f54c11abc8583417a7cf4eaff544ce0de8187abaf5d29029c91b1725ad3",
- "sha256:4424e42199e86b21fc4db83bd76909a6fc2a2aefb352cb5414833c030f6ed71b",
- "sha256:4a43c91840bda5f55249413037b7a9b79c90b1184ed504883b72c4df70778579",
- "sha256:599a1e8ff057ac530c9ad1778293c665cb81a791421f46922d80a86473c13346",
- "sha256:5c4fae4e9cdd18c82ba3a134be256e98dc0596af1e7285a3d2602c97dcfa5159",
- "sha256:5ecfa867dea6fabe2a58f03ac9186ea64da1386af2159196da51c4904e11d652",
- "sha256:62f2578358d3a92e4ab2d830cd1c2049c9c0d0e6d3c58322993cc341bdeac22e",
- "sha256:6471a82d5abea994e38d2c2abc77164b4f7fbaaf80261cb98394d5793f11b12a",
- "sha256:6d4f18483d040e18546108eb13b1dfa1000a089bcf8529e30346116ea6240506",
- "sha256:71a608532ab3bd26223c8d841dde43f3516aa5d2bf37b50ac410bb5e99053e8f",
- "sha256:74a1d8c85fb6ff0b30fbfa8ad0ac23cd601a138f7509dc617ebc65ef305bb98d",
- "sha256:7b93a885bb13073afb0aa73ad82059a4c41f4b7d8eb8368980448b52d4c7dc2c",
- "sha256:7d4751da932caaec419d514eaa4215eaf14b612cff66398dd51129ac22680b20",
- "sha256:7f627141a26b551bdebbc4855c1157feeef18241b4b8366ed22a5c7d672ef858",
- "sha256:8169cf44dd8f9071b2b9248c35fc35e8677451c52f795daa2bb4643f32a540bc",
- "sha256:aa00d66c0fab27373ae44ae26a66a9e43ff2a678bf63a9c7c1a9a4d61172827a",
- "sha256:ccb032fda0873254380aa2bfad2582aedc2959186cce61e3a17abc1a55ff89c3",
- "sha256:d754f39e0d1603b5b24a7f8484b22d2904fa551fe865fd0d4c3332f078d20d4e",
- "sha256:d75c461e20e29afc0aee7172a0950157c704ff0dd51613506bd7d82b718e7410",
- "sha256:dcd65317dd15bc0451f3e01c80da2216a31916bdcffd6221ca1202d96584aa25",
- "sha256:e570d3ab32e2c2861c4ebe6ffcad6a8abf9347432a37608fe1fbd157b3f0036b",
- "sha256:fd43a88e045cf992ed09fa724b5315b790525f2676883a6ea64e3263bae6549d"
- ],
- "version": "==1.13.2"
+ "sha256:001bf3242a1bb04d985d63e138230802c6c8d4db3668fb545fb5005ddf5bb5ff",
+ "sha256:00789914be39dffba161cfc5be31b55775de5ba2235fe49aa28c148236c4e06b",
+ "sha256:028a579fc9aed3af38f4892bdcc7390508adabc30c6af4a6e4f611b0c680e6ac",
+ "sha256:14491a910663bf9f13ddf2bc8f60562d6bc5315c1f09c704937ef17293fb85b0",
+ "sha256:1cae98a7054b5c9391eb3249b86e0e99ab1e02bb0cc0575da191aedadbdf4384",
+ "sha256:2089ed025da3919d2e75a4d963d008330c96751127dd6f73c8dc0c65041b4c26",
+ "sha256:2d384f4a127a15ba701207f7639d94106693b6cd64173d6c8988e2c25f3ac2b6",
+ "sha256:337d448e5a725bba2d8293c48d9353fc68d0e9e4088d62a9571def317797522b",
+ "sha256:399aed636c7d3749bbed55bc907c3288cb43c65c4389964ad5ff849b6370603e",
+ "sha256:3b911c2dbd4f423b4c4fcca138cadde747abdb20d196c4a48708b8a2d32b16dd",
+ "sha256:3d311bcc4a41408cf5854f06ef2c5cab88f9fded37a3b95936c9879c1640d4c2",
+ "sha256:62ae9af2d069ea2698bf536dcfe1e4eed9090211dbaafeeedf5cb6c41b352f66",
+ "sha256:66e41db66b47d0d8672d8ed2708ba91b2f2524ece3dee48b5dfb36be8c2f21dc",
+ "sha256:675686925a9fb403edba0114db74e741d8181683dcf216be697d208857e04ca8",
+ "sha256:7e63cbcf2429a8dbfe48dcc2322d5f2220b77b2e17b7ba023d6166d84655da55",
+ "sha256:8a6c688fefb4e1cd56feb6c511984a6c4f7ec7d2a1ff31a10254f3c817054ae4",
+ "sha256:8c0ffc886aea5df6a1762d0019e9cb05f825d0eec1f520c51be9d198701daee5",
+ "sha256:95cd16d3dee553f882540c1ffe331d085c9e629499ceadfbda4d4fde635f4b7d",
+ "sha256:99f748a7e71ff382613b4e1acc0ac83bf7ad167fb3802e35e90d9763daba4d78",
+ "sha256:b8c78301cefcf5fd914aad35d3c04c2b21ce8629b5e4f4e45ae6812e461910fa",
+ "sha256:c420917b188a5582a56d8b93bdd8e0f6eca08c84ff623a4c16e809152cd35793",
+ "sha256:c43866529f2f06fe0edc6246eb4faa34f03fe88b64a0a9a942561c8e22f4b71f",
+ "sha256:cab50b8c2250b46fe738c77dbd25ce017d5e6fb35d3407606e7a4180656a5a6a",
+ "sha256:cef128cb4d5e0b3493f058f10ce32365972c554572ff821e175dbc6f8ff6924f",
+ "sha256:cf16e3cf6c0a5fdd9bc10c21687e19d29ad1fe863372b5543deaec1039581a30",
+ "sha256:e56c744aa6ff427a607763346e4170629caf7e48ead6921745986db3692f987f",
+ "sha256:e577934fc5f8779c554639376beeaa5657d54349096ef24abe8c74c5d9c117c3",
+ "sha256:f2b0fa0c01d8a0c7483afd9f31d7ecf2d71760ca24499c8697aeb5ca37dc090c"
+ ],
+ "version": "==1.14.0"
},
"chardet": {
"hashes": [
@@ -98,11 +94,10 @@
},
"citeproc-py": {
"hashes": [
- "sha256:ed513dbc76f782b5e98126d6bebbd1284841fcf199ec9dda552e2bce864adadf",
- "sha256:ee1569e8b1f2c057c7893bbb067986bc362a6c861009d310700a12a1a0107b57"
+ "sha256:00f8919766fb07ddc6ace5e5b88282809ecae738b031bd32edeb762a31367113"
],
"index": "pypi",
- "version": "==0.4.0"
+ "version": "==0.5.0"
},
"citeproc-py-styles": {
"hashes": [
@@ -220,21 +215,23 @@
},
"flask-debugtoolbar": {
"hashes": [
- "sha256:3d9657bc0c3633ace429e3ff451742bb59d1b7a7b95c9eb23a65ac9be2812959",
- "sha256:ec810083123aae0632eb32ba11e1cb4cdace81e7ce6c5009dd06c5204afbce52"
+ "sha256:0e9a80d4c599233c68376e81cc99976200b5ac5248cfb24f18935cc5b69ac5b3",
+ "sha256:3c4e79d354ede014e6657c545a536d4fb273cc89e3fd6b4835b02e346dd3aab4"
],
"index": "pypi",
- "version": "==0.10.1"
+ "version": "==0.11.0"
},
"flask-login": {
"hashes": [
- "sha256:c815c1ac7b3e35e2081685e389a665f2c74d7e077cb93cecabaea352da4752ec"
+ "sha256:6d33aef15b5bcead780acc339464aae8a6e28f13c90d8b1cf9de8b549d1c0b4b",
+ "sha256:7451b5001e17837ba58945aead261ba425fdf7b4f0448777e597ddab39f4fba0"
],
"index": "pypi",
- "version": "==0.4.1"
+ "version": "==0.5.0"
},
"flask-misaka": {
"hashes": [
+ "sha256:bcfdacc0803ccea75d377737e82c83489b2153d922c9d9f9eabc5148d216ed70",
"sha256:d0cfb0efd9e5afacda76defd4a605a68390f4fb1bef283c71534fd3ce0d3efb5",
"sha256:f423c3beb5502742a57330a272f81d53223f6f99d45cc45b03926e3a3034f589"
],
@@ -257,18 +254,18 @@
},
"flask-wtf": {
"hashes": [
- "sha256:5d14d55cfd35f613d99ee7cba0fc3fbbe63ba02f544d349158c14ca15561cc36",
- "sha256:d9a9e366b32dcbb98ef17228e76be15702cd2600675668bca23f63a7947fd5ac"
+ "sha256:57b3faf6fe5d6168bda0c36b0df1d05770f8e205e18332d0376ddb954d17aef2",
+ "sha256:d417e3a0008b5ba583da1763e4db0f55a1269d9dd91dcc3eb3c026d3c5dbd720"
],
"index": "pypi",
- "version": "==0.14.2"
+ "version": "==0.14.3"
},
"ftfy": {
"hashes": [
- "sha256:6d7509c45e602dec890f0f6ee0623a8b5f50ec1188ac7ab9535e18e572c99bcc"
+ "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8"
],
"index": "pypi",
- "version": "==5.6"
+ "version": "==5.7"
},
"future": {
"hashes": [
@@ -278,10 +275,10 @@
},
"idna": {
"hashes": [
- "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
- "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+ "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
+ "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
],
- "version": "==2.8"
+ "version": "==2.9"
},
"itsdangerous": {
"hashes": [
@@ -292,10 +289,10 @@
},
"jinja2": {
"hashes": [
- "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f",
- "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de"
+ "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250",
+ "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49"
],
- "version": "==2.10.3"
+ "version": "==2.11.1"
},
"kazoo": {
"hashes": [
@@ -321,34 +318,35 @@
},
"lxml": {
"hashes": [
- "sha256:00ac0d64949fef6b3693813fe636a2d56d97a5a49b5bbb86e4cc4cc50ebc9ea2",
- "sha256:0571e607558665ed42e450d7bf0e2941d542c18e117b1ebbf0ba72f287ad841c",
- "sha256:0e3f04a7615fdac0be5e18b2406529521d6dbdb0167d2a690ee328bef7807487",
- "sha256:13cf89be53348d1c17b453867da68704802966c433b2bb4fa1f970daadd2ef70",
- "sha256:217262fcf6a4c2e1c7cb1efa08bd9ebc432502abc6c255c4abab611e8be0d14d",
- "sha256:223e544828f1955daaf4cefbb4853bc416b2ec3fd56d4f4204a8b17007c21250",
- "sha256:277cb61fede2f95b9c61912fefb3d43fbd5f18bf18a14fae4911b67984486f5d",
- "sha256:3213f753e8ae86c396e0e066866e64c6b04618e85c723b32ecb0909885211f74",
- "sha256:4690984a4dee1033da0af6df0b7a6bde83f74e1c0c870623797cec77964de34d",
- "sha256:4fcc472ef87f45c429d3b923b925704aa581f875d65bac80f8ab0c3296a63f78",
- "sha256:61409bd745a265a742f2693e4600e4dbd45cc1daebe1d5fad6fcb22912d44145",
- "sha256:678f1963f755c5d9f5f6968dded7b245dd1ece8cf53c1aa9d80e6734a8c7f41d",
- "sha256:6c6d03549d4e2734133badb9ab1c05d9f0ef4bcd31d83e5d2b4747c85cfa21da",
- "sha256:6e74d5f4d6ecd6942375c52ffcd35f4318a61a02328f6f1bd79fcb4ffedf969e",
- "sha256:7b4fc7b1ecc987ca7aaf3f4f0e71bbfbd81aaabf87002558f5bc95da3a865bcd",
- "sha256:7ed386a40e172ddf44c061ad74881d8622f791d9af0b6f5be20023029129bc85",
- "sha256:8f54f0924d12c47a382c600c880770b5ebfc96c9fd94cf6f6bdc21caf6163ea7",
- "sha256:ad9b81351fdc236bda538efa6879315448411a81186c836d4b80d6ca8217cdb9",
- "sha256:bbd00e21ea17f7bcc58dccd13869d68441b32899e89cf6cfa90d624a9198ce85",
- "sha256:c3c289762cc09735e2a8f8a49571d0e8b4f57ea831ea11558247b5bdea0ac4db",
- "sha256:cf4650942de5e5685ad308e22bcafbccfe37c54aa7c0e30cd620c2ee5c93d336",
- "sha256:cfcbc33c9c59c93776aa41ab02e55c288a042211708b72fdb518221cc803abc8",
- "sha256:e301055deadfedbd80cf94f2f65ff23126b232b0d1fea28f332ce58137bcdb18",
- "sha256:ebbfe24df7f7b5c6c7620702496b6419f6a9aa2fd7f005eb731cc80d7b4692b9",
- "sha256:eff69ddbf3ad86375c344339371168640951c302450c5d3e9936e98d6459db06",
- "sha256:f6ed60a62c5f1c44e789d2cf14009423cb1646b44a43e40a9cf6a21f077678a1"
- ],
- "version": "==4.4.2"
+ "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd",
+ "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c",
+ "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081",
+ "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f",
+ "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261",
+ "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a",
+ "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9",
+ "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a",
+ "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb",
+ "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60",
+ "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128",
+ "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a",
+ "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717",
+ "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89",
+ "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72",
+ "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8",
+ "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3",
+ "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7",
+ "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8",
+ "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77",
+ "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1",
+ "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15",
+ "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679",
+ "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012",
+ "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6",
+ "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc",
+ "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca"
+ ],
+ "version": "==4.5.0"
},
"markupsafe": {
"hashes": [
@@ -356,13 +354,16 @@
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
"sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
"sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
+ "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
"sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
"sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
"sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
"sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
"sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
+ "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
"sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
+ "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
"sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
@@ -379,7 +380,9 @@
"sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
"sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
"sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
- "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"
+ "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
+ "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
+ "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
],
"version": "==1.1.1"
},
@@ -442,6 +445,7 @@
},
"pykafka": {
"hashes": [
+ "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459",
"sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577"
],
"index": "pypi",
@@ -498,11 +502,11 @@
},
"python-dotenv": {
"hashes": [
- "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093",
- "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544"
+ "sha256:8429f459fc041237d98c9ff32e1938e7e5535b5ff24388876315a098027c3a57",
+ "sha256:ca9f3debf2262170d6f46571ce4d6ca1add60bb93b69c3a29dcb3d1a00a65c93"
],
"index": "pypi",
- "version": "==0.10.3"
+ "version": "==0.11.0"
},
"python-magic": {
"hashes": [
@@ -528,9 +532,6 @@
"version": "==2019.3"
},
"raven": {
- "extras": [
- "flask"
- ],
"hashes": [
"sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
"sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
@@ -540,66 +541,74 @@
},
"regex": {
"hashes": [
- "sha256:07b39bf943d3d2fe63d46281d8504f8df0ff3fe4c57e13d1656737950e53e525",
- "sha256:0932941cdfb3afcbc26cc3bcf7c3f3d73d5a9b9c56955d432dbf8bbc147d4c5b",
- "sha256:0e182d2f097ea8549a249040922fa2b92ae28be4be4895933e369a525ba36576",
- "sha256:10671601ee06cf4dc1bc0b4805309040bb34c9af423c12c379c83d7895622bb5",
- "sha256:23e2c2c0ff50f44877f64780b815b8fd2e003cda9ce817a7fd00dea5600c84a0",
- "sha256:26ff99c980f53b3191d8931b199b29d6787c059f2e029b2b0c694343b1708c35",
- "sha256:27429b8d74ba683484a06b260b7bb00f312e7c757792628ea251afdbf1434003",
- "sha256:3e77409b678b21a056415da3a56abfd7c3ad03da71f3051bbcdb68cf44d3c34d",
- "sha256:4e8f02d3d72ca94efc8396f8036c0d3bcc812aefc28ec70f35bb888c74a25161",
- "sha256:4eae742636aec40cf7ab98171ab9400393360b97e8f9da67b1867a9ee0889b26",
- "sha256:6a6ae17bf8f2d82d1e8858a47757ce389b880083c4ff2498dba17c56e6c103b9",
- "sha256:6a6ba91b94427cd49cd27764679024b14a96874e0dc638ae6bdd4b1a3ce97be1",
- "sha256:7bcd322935377abcc79bfe5b63c44abd0b29387f267791d566bbb566edfdd146",
- "sha256:98b8ed7bb2155e2cbb8b76f627b2fd12cf4b22ab6e14873e8641f266e0fb6d8f",
- "sha256:bd25bb7980917e4e70ccccd7e3b5740614f1c408a642c245019cff9d7d1b6149",
- "sha256:d0f424328f9822b0323b3b6f2e4b9c90960b24743d220763c7f07071e0778351",
- "sha256:d58e4606da2a41659c84baeb3cfa2e4c87a74cec89a1e7c56bee4b956f9d7461",
- "sha256:e3cd21cc2840ca67de0bbe4071f79f031c81418deb544ceda93ad75ca1ee9f7b",
- "sha256:e6c02171d62ed6972ca8631f6f34fa3281d51db8b326ee397b9c83093a6b7242",
- "sha256:e7c7661f7276507bce416eaae22040fd91ca471b5b33c13f8ff21137ed6f248c",
- "sha256:ecc6de77df3ef68fee966bb8cb4e067e84d4d1f397d0ef6fce46913663540d77"
- ],
- "version": "==2020.1.8"
+ "sha256:01b2d70cbaed11f72e57c1cfbaca71b02e3b98f739ce33f5f26f71859ad90431",
+ "sha256:046e83a8b160aff37e7034139a336b660b01dbfe58706f9d73f5cdc6b3460242",
+ "sha256:113309e819634f499d0006f6200700c8209a2a8bf6bd1bdc863a4d9d6776a5d1",
+ "sha256:200539b5124bc4721247a823a47d116a7a23e62cc6695744e3eb5454a8888e6d",
+ "sha256:25f4ce26b68425b80a233ce7b6218743c71cf7297dbe02feab1d711a2bf90045",
+ "sha256:269f0c5ff23639316b29f31df199f401e4cb87529eafff0c76828071635d417b",
+ "sha256:5de40649d4f88a15c9489ed37f88f053c15400257eeb18425ac7ed0a4e119400",
+ "sha256:7f78f963e62a61e294adb6ff5db901b629ef78cb2a1cfce3cf4eeba80c1c67aa",
+ "sha256:82469a0c1330a4beb3d42568f82dffa32226ced006e0b063719468dcd40ffdf0",
+ "sha256:8c2b7fa4d72781577ac45ab658da44c7518e6d96e2a50d04ecb0fd8f28b21d69",
+ "sha256:974535648f31c2b712a6b2595969f8ab370834080e00ab24e5dbb9d19b8bfb74",
+ "sha256:99272d6b6a68c7ae4391908fc15f6b8c9a6c345a46b632d7fdb7ef6c883a2bbb",
+ "sha256:9b64a4cc825ec4df262050c17e18f60252cdd94742b4ba1286bcfe481f1c0f26",
+ "sha256:9e9624440d754733eddbcd4614378c18713d2d9d0dc647cf9c72f64e39671be5",
+ "sha256:9ff16d994309b26a1cdf666a6309c1ef51ad4f72f99d3392bcd7b7139577a1f2",
+ "sha256:b33ebcd0222c1d77e61dbcd04a9fd139359bded86803063d3d2d197b796c63ce",
+ "sha256:bba52d72e16a554d1894a0cc74041da50eea99a8483e591a9edf1025a66843ab",
+ "sha256:bed7986547ce54d230fd8721aba6fd19459cdc6d315497b98686d0416efaff4e",
+ "sha256:c7f58a0e0e13fb44623b65b01052dae8e820ed9b8b654bb6296bc9c41f571b70",
+ "sha256:d58a4fa7910102500722defbde6e2816b0372a4fcc85c7e239323767c74f5cbc",
+ "sha256:f1ac2dc65105a53c1c2d72b1d3e98c2464a133b4067a51a3d2477b28449709a0"
+ ],
+ "version": "==2020.2.20"
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
+ "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
],
"index": "pypi",
- "version": "==2.22.0"
+ "version": "==2.23.0"
+ },
+ "requests-file": {
+ "hashes": [
+ "sha256:75c175eed739270aec3c5279ffd74e6527dada275c5c0d76b5817e9c86bb7dea",
+ "sha256:8f04aa6201bacda0567e7ac7f677f1499b0fc76b22140c54bc06edf1ba92e2fa"
+ ],
+ "version": "==1.4.3"
},
"requests-oauthlib": {
"hashes": [
"sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d",
- "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a"
+ "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a",
+ "sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc"
],
"version": "==1.3.0"
},
"sickle": {
"hashes": [
- "sha256:76d66ed4607af2cd36ee15568a98e7f147d4ec3dd227bd047664a1ca88b21944",
- "sha256:b0aaa41d97a0c355aa6099b4dfa46c03f0bf828e6171960a15d68bd0548215ec"
+ "sha256:c0841b4df5edea33d2da01e893b3feadb1a8eacd2ca4dab9248e6047455ba14a",
+ "sha256:efdfa46c40cd34b3550f11b59c607ff0bd63adbc074efaa8b4dd662108269a2f"
],
"index": "pypi",
- "version": "==0.6.4"
+ "version": "==0.6.5"
},
"six": {
"hashes": [
- "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd",
- "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"
+ "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
+ "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
],
- "version": "==1.13.0"
+ "version": "==1.14.0"
},
"soupsieve": {
"hashes": [
- "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5",
- "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda"
+ "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae",
+ "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69"
],
- "version": "==1.9.5"
+ "version": "==2.0"
},
"tabulate": {
"hashes": [
@@ -607,6 +616,14 @@
],
"version": "==0.8.6"
},
+ "tldextract": {
+ "hashes": [
+ "sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7",
+ "sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808"
+ ],
+ "index": "pypi",
+ "version": "==2.2.2"
+ },
"tzlocal": {
"hashes": [
"sha256:11c9f16e0a633b4b60e1eede97d8a46340d042e67b670b290ca526576e039048",
@@ -616,10 +633,10 @@
},
"urllib3": {
"hashes": [
- "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
- "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
+ "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc",
+ "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"
],
- "version": "==1.25.7"
+ "version": "==1.25.8"
},
"wcwidth": {
"hashes": [
@@ -630,10 +647,10 @@
},
"werkzeug": {
"hashes": [
- "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7",
- "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4"
+ "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096",
+ "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16"
],
- "version": "==0.16.0"
+ "version": "==1.0.0"
},
"wtforms": {
"hashes": [
@@ -642,6 +659,14 @@
],
"index": "pypi",
"version": "==2.2.1"
+ },
+ "zipp": {
+ "hashes": [
+ "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1",
+ "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"
+ ],
+ "index": "pypi",
+ "version": "==1.2.0"
}
},
"develop": {
@@ -682,39 +707,39 @@
},
"coverage": {
"hashes": [
- "sha256:189aac76d6e0d7af15572c51892e7326ee451c076c5a50a9d266406cd6c49708",
- "sha256:1bf7ba2af1d373a1750888724f84cffdfc697738f29a353c98195f98fc011509",
- "sha256:1f4ee8e2e4243971618bc16fcc4478317405205f135e95226c2496e2a3b8dbbf",
- "sha256:225e79a5d485bc1642cb7ba02281419c633c216cdc6b26c26494ba959f09e69f",
- "sha256:23688ff75adfa8bfa2a67254d889f9bdf9302c27241d746e17547c42c732d3f4",
- "sha256:28f7f73b34a05e23758e860a89a7f649b85c6749e252eff60ebb05532d180e86",
- "sha256:2d0cb9b1fe6ad0d915d45ad3d87f03a38e979093a98597e755930db1f897afae",
- "sha256:47874b4711c5aeb295c31b228a758ce3d096be83dc37bd56da48ed99efb8813b",
- "sha256:511ec0c00840e12fb4e852e4db58fa6a01ca4da72f36a9766fae344c3d502033",
- "sha256:53e7438fef0c97bc248f88ba1edd10268cd94d5609970aaf87abbe493691af87",
- "sha256:569f9ee3025682afda6e9b0f5bb14897c0db03f1a1dc088b083dd36e743f92bb",
- "sha256:593853aa1ac6dcc6405324d877544c596c9d948ef20d2e9512a0f5d2d3202356",
- "sha256:5b0a07158360d22492f9abd02a0f2ee7981b33f0646bf796598b7673f6bbab14",
- "sha256:7ca3db38a61f3655a2613ee2c190d63639215a7a736d3c64cc7bbdb002ce6310",
- "sha256:7d1cc7acc9ce55179616cf72154f9e648136ea55987edf84addbcd9886ffeba2",
- "sha256:88b51153657612aea68fa684a5b88037597925260392b7bb4509d4f9b0bdd889",
- "sha256:955ec084f549128fa2702f0b2dc696392001d986b71acd8fd47424f28289a9c3",
- "sha256:b251c7092cbb6d789d62dc9c9e7c4fb448c9138b51285c36aeb72462cad3600e",
- "sha256:bd82b684bb498c60ef47bb1541a50e6d006dde8579934dcbdbc61d67d1ea70d9",
- "sha256:bfe102659e2ec13b86c7f3b1db6c9a4e7beea4255058d006351339e6b342d5d2",
- "sha256:c1e4e39e43057396a5e9d069bfbb6ffeee892e40c5d2effbd8cd71f34ee66c4d",
- "sha256:cb2b74c123f65e8166f7e1265829a6c8ed755c3cd16d7f50e75a83456a5f3fd7",
- "sha256:cca38ded59105f7705ef6ffe1e960b8db6c7d8279c1e71654a4775ab4454ca15",
- "sha256:cf908840896f7aa62d0ec693beb53264b154f972eb8226fb864ac38975590c4f",
- "sha256:d095a7b473f8a95f7efe821f92058c8a2ecfb18f8db6677ae3819e15dc11aaae",
- "sha256:d22b4297e7e4225ccf01f1aa55e7a96412ea0796b532dd614c3fcbafa341128e",
- "sha256:d4a2b578a7a70e0c71f662705262f87a456f1e6c1e40ada7ea699abaf070a76d",
- "sha256:ddeb42a3d5419434742bf4cc71c9eaa22df3b76808e23a82bd0b0bd360f1a9f1",
- "sha256:e65a5aa1670db6263f19fdc03daee1d7dbbadb5cb67fd0a1f16033659db13c1d",
- "sha256:eaad65bd20955131bcdb3967a4dea66b4e4d4ca488efed7c00d91ee0173387e8",
- "sha256:f45fba420b94165c17896861bb0e8b27fb7abdcedfeb154895d8553df90b7b00"
- ],
- "version": "==5.0.2"
+ "sha256:15cf13a6896048d6d947bf7d222f36e4809ab926894beb748fc9caa14605d9c3",
+ "sha256:1daa3eceed220f9fdb80d5ff950dd95112cd27f70d004c7918ca6dfc6c47054c",
+ "sha256:1e44a022500d944d42f94df76727ba3fc0a5c0b672c358b61067abb88caee7a0",
+ "sha256:25dbf1110d70bab68a74b4b9d74f30e99b177cde3388e07cc7272f2168bd1477",
+ "sha256:3230d1003eec018ad4a472d254991e34241e0bbd513e97a29727c7c2f637bd2a",
+ "sha256:3dbb72eaeea5763676a1a1efd9b427a048c97c39ed92e13336e726117d0b72bf",
+ "sha256:5012d3b8d5a500834783689a5d2292fe06ec75dc86ee1ccdad04b6f5bf231691",
+ "sha256:51bc7710b13a2ae0c726f69756cf7ffd4362f4ac36546e243136187cfcc8aa73",
+ "sha256:527b4f316e6bf7755082a783726da20671a0cc388b786a64417780b90565b987",
+ "sha256:722e4557c8039aad9592c6a4213db75da08c2cd9945320220634f637251c3894",
+ "sha256:76e2057e8ffba5472fd28a3a010431fd9e928885ff480cb278877c6e9943cc2e",
+ "sha256:77afca04240c40450c331fa796b3eab6f1e15c5ecf8bf2b8bee9706cd5452fef",
+ "sha256:7afad9835e7a651d3551eab18cbc0fdb888f0a6136169fbef0662d9cdc9987cf",
+ "sha256:9bea19ac2f08672636350f203db89382121c9c2ade85d945953ef3c8cf9d2a68",
+ "sha256:a8b8ac7876bc3598e43e2603f772d2353d9931709345ad6c1149009fd1bc81b8",
+ "sha256:b0840b45187699affd4c6588286d429cd79a99d509fe3de0f209594669bb0954",
+ "sha256:b26aaf69713e5674efbde4d728fb7124e429c9466aeaf5f4a7e9e699b12c9fe2",
+ "sha256:b63dd43f455ba878e5e9f80ba4f748c0a2156dde6e0e6e690310e24d6e8caf40",
+ "sha256:be18f4ae5a9e46edae3f329de2191747966a34a3d93046dbdf897319923923bc",
+ "sha256:c312e57847db2526bc92b9bfa78266bfbaabac3fdcd751df4d062cd4c23e46dc",
+ "sha256:c60097190fe9dc2b329a0eb03393e2e0829156a589bd732e70794c0dd804258e",
+ "sha256:c62a2143e1313944bf4a5ab34fd3b4be15367a02e9478b0ce800cb510e3bbb9d",
+ "sha256:cc1109f54a14d940b8512ee9f1c3975c181bbb200306c6d8b87d93376538782f",
+ "sha256:cd60f507c125ac0ad83f05803063bed27e50fa903b9c2cfee3f8a6867ca600fc",
+ "sha256:d513cc3db248e566e07a0da99c230aca3556d9b09ed02f420664e2da97eac301",
+ "sha256:d649dc0bcace6fcdb446ae02b98798a856593b19b637c1b9af8edadf2b150bea",
+ "sha256:d7008a6796095a79544f4da1ee49418901961c97ca9e9d44904205ff7d6aa8cb",
+ "sha256:da93027835164b8223e8e5af2cf902a4c80ed93cb0909417234f4a9df3bcd9af",
+ "sha256:e69215621707119c6baf99bda014a45b999d37602cb7043d943c76a59b05bf52",
+ "sha256:ea9525e0fef2de9208250d6c5aeeee0138921057cd67fcef90fbed49c4d62d37",
+ "sha256:fca1669d464f0c9831fd10be2eef6b86f5ebd76c724d1e0706ebdff86bb4adf0"
+ ],
+ "version": "==5.0.3"
},
"decorator": {
"hashes": [
@@ -725,18 +750,18 @@
},
"idna": {
"hashes": [
- "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
- "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+ "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
+ "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
],
- "version": "==2.8"
+ "version": "==2.9"
},
"importlib-metadata": {
"hashes": [
- "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45",
- "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"
+ "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302",
+ "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b"
],
"markers": "python_version < '3.8'",
- "version": "==1.3.0"
+ "version": "==1.5.0"
},
"ipython": {
"hashes": [
@@ -762,10 +787,10 @@
},
"jedi": {
"hashes": [
- "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064",
- "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807"
+ "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2",
+ "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5"
],
- "version": "==0.15.2"
+ "version": "==0.16.0"
},
"lazy-object-proxy": {
"hashes": [
@@ -802,24 +827,24 @@
},
"more-itertools": {
"hashes": [
- "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d",
- "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564"
+ "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c",
+ "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507"
],
- "version": "==8.0.2"
+ "version": "==8.2.0"
},
"packaging": {
"hashes": [
- "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb",
- "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8"
+ "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73",
+ "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334"
],
- "version": "==20.0"
+ "version": "==20.1"
},
"parso": {
"hashes": [
- "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1",
- "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3"
+ "sha256:0c5659e0c6eba20636f99a04f469798dca8da279645ce5c387315b2c23912157",
+ "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995"
],
- "version": "==0.5.2"
+ "version": "==0.6.2"
},
"pathlib2": {
"hashes": [
@@ -831,11 +856,11 @@
},
"pexpect": {
"hashes": [
- "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
- "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+ "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937",
+ "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"
],
"markers": "sys_platform != 'win32'",
- "version": "==4.7.0"
+ "version": "==4.8.0"
},
"pg-view": {
"hashes": [
@@ -922,11 +947,11 @@
},
"pytest": {
"hashes": [
- "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa",
- "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4"
+ "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d",
+ "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"
],
"index": "pypi",
- "version": "==5.3.2"
+ "version": "==5.3.5"
},
"pytest-cov": {
"hashes": [
@@ -946,12 +971,11 @@
},
"pytest-pylint": {
"hashes": [
- "sha256:8c38ea779e540e27ec4378b0820d906006e09f4ac834defbd886abbf57c7d2ec",
- "sha256:a4f5e5007f88c2095dcac799e9f7eed3d7e7a2e657596e26093814980ff5fa20",
- "sha256:a574c246535308f8f6ceac10fa82f8fffffa837071f7985b22515895185700c1"
+ "sha256:3996a55ba66ce8ddf150754d8549567a4b067d63fa4513fdfd3325c7553c8075",
+ "sha256:b3f83f4525b2cbd019e9e46b4ee9c4ccee82bde66edf9872690ccfdc75456703"
],
"index": "pypi",
- "version": "==0.14.1"
+ "version": "==0.15.0"
},
"pytest-pythonpath": {
"hashes": [
@@ -962,19 +986,19 @@
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
+ "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
],
"index": "pypi",
- "version": "==2.22.0"
+ "version": "==2.23.0"
},
"responses": {
"hashes": [
- "sha256:515fd7c024097e5da76e9c4cf719083d181f1c3ddc09c2e0e49284ce863dd263",
- "sha256:8ce8cb4e7e1ad89336f8865af152e0563d2e7f0e0b86d2cf75f015f819409243"
+ "sha256:33d8994990de31a9fbb8c889c3a378dac296b821f37330c845608ae565e7fca3",
+ "sha256:859a7f8495a069ce835620bb4bb111688f593a6f8873a3335a99c31a5aef9a9a"
],
"index": "pypi",
- "version": "==0.10.9"
+ "version": "==0.10.11"
},
"simplegeneric": {
"hashes": [
@@ -984,10 +1008,10 @@
},
"six": {
"hashes": [
- "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd",
- "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"
+ "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
+ "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
],
- "version": "==1.13.0"
+ "version": "==1.14.0"
},
"traitlets": {
"hashes": [
@@ -998,36 +1022,37 @@
},
"typed-ast": {
"hashes": [
- "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161",
- "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e",
- "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e",
- "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0",
- "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c",
- "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47",
- "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631",
- "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4",
- "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34",
- "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b",
- "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2",
- "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e",
- "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a",
- "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233",
- "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1",
- "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36",
- "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d",
- "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a",
- "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66",
- "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12"
+ "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
+ "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
+ "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
+ "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
+ "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
+ "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
+ "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
+ "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
+ "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
+ "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
+ "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
+ "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
+ "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
+ "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
+ "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
+ "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
+ "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
+ "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
+ "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
+ "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
+ "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
],
"markers": "implementation_name == 'cpython' and python_version < '3.8'",
- "version": "==1.4.0"
+ "version": "==1.4.1"
},
"urllib3": {
"hashes": [
- "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
- "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
+ "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc",
+ "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"
],
- "version": "==1.25.7"
+ "version": "==1.25.8"
},
"wcwidth": {
"hashes": [
@@ -1044,10 +1069,11 @@
},
"zipp": {
"hashes": [
- "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e",
- "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335"
+ "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1",
+ "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"
],
- "version": "==0.6.0"
+ "index": "pypi",
+ "version": "==1.2.0"
}
}
}
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index bb715fb6..e1e06653 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -166,6 +166,11 @@ def run_grobid_metadata(args):
bezerk_mode=args.bezerk_mode)
LinePusher(fmi, args.tsv_file).run()
+def run_shadow_lib(args):
+ fmi = ShadowLibraryImporter(args.api,
+ edit_batch_size=100)
+ JsonLinePusher(fmi, args.json_file).run()
+
def run_wayback_static(args):
api = args.api
@@ -475,6 +480,16 @@ def main():
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
+ sub_shadow_lib = subparsers.add_parser('shadow-lib',
+ help="create release and file entities based on GROBID PDF metadata extraction")
+ sub_shadow_lib.set_defaults(
+ func=run_shadow_lib,
+ auth_var="FATCAT_AUTH_WORKER_SHADOW",
+ )
+ sub_shadow_lib.add_argument('json_file',
+ help="JSON file to import from (or stdin)",
+ default=sys.stdin, type=argparse.FileType('r'))
+
sub_wayback_static = subparsers.add_parser('wayback-static',
help="crude crawl+ingest tool for single-page HTML docs from wayback")
sub_wayback_static.set_defaults(
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 6fda74c5..9ba95015 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -90,6 +90,8 @@ def _run_search_dump(args, search):
)
if not ingest_request:
continue
+ if args.force_recrawl:
+ ingest_request['force_recrawl'] = True
counts['ingest_request'] += 1
if args.dry_run:
continue
@@ -206,6 +208,9 @@ def main():
parser.add_argument('--allow-non-oa',
action='store_true',
help="By default, we limit to OA releases. This removes that filter")
+ parser.add_argument('--force-recrawl',
+ action='store_true',
+ help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl")
subparsers = parser.add_subparsers()
sub_container = subparsers.add_parser('container',
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 03c7cbcc..c26446fd 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 5f5c46b8..c000ad62 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -194,6 +194,8 @@ DOMAIN_REL_MAP = {
"www.scielo.cl": "repository",
"www.scielo.org.mx": "repository",
"zenodo.org": "repository",
+ "www.biorxiv.org": "repository",
+ "www.medrxiv.org": "repository",
"citeseerx.ist.psu.edu": "aggregator",
"publisher-connector.core.ac.uk": "aggregator",
@@ -220,6 +222,13 @@ DOMAIN_REL_MAP = {
"www.nature.com": "publisher",
"www.pnas.org": "publisher",
"www.tandfonline.com": "publisher",
+ "www.frontiersin.org": "publisher",
+ "www.degruyter.com": "publisher",
+ "www.mdpi.com": "publisher",
+ "www.ahajournals.org": "publisher",
+ "ehp.niehs.nih.gov": "publisher",
+ "journals.tsu.ru": "publisher",
+ "www.cogentoa.com": "publisher",
"www.researchgate.net": "academicsocial",
"academia.edu": "academicsocial",
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index fdaba176..4772bfaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -32,8 +32,11 @@ class IngestFileResultImporter(EntityImporter):
'fatcat-ingest',
'arabesque',
'mag-corpus',
+ 'mag',
'unpaywall-corpus',
+ 'unpaywall',
's2-corpus',
+ 's2',
]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
@@ -137,7 +140,12 @@ class IngestFileResultImporter(EntityImporter):
if not 'terminal_dt' in terminal:
terminal['terminal_dt'] = terminal['dt']
assert len(terminal['terminal_dt']) == 14
- url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
+
+ default_rel = self.default_link_rel
+ if request.get('link_source') == 'doi':
+ default_rel = 'publisher'
+ default_rel = request.get('rel', default_rel)
+ url = make_rel_url(terminal['terminal_url'], default_rel)
if not url:
self.counts['skip-url'] += 1
@@ -158,8 +166,8 @@ class IngestFileResultImporter(EntityImporter):
release_ids=[release_ident],
urls=urls,
)
- if fatcat and fatcat.get('edit_extra'):
- fe.edit_extra = fatcat['edit_extra']
+ if request.get('edit_extra'):
+ fe.edit_extra = request['edit_extra']
else:
fe.edit_extra = dict()
if request.get('ingest_request_source'):
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+ """
+ Importer for shadow library files (matched to releases)
+
+ Input format is JSON with keys:
+ - shadow
+ - shadow_corpus (string slug)
+ - shadow_id (string)
+ - doi
+ - pmid
+ - isbn13
+ - file_meta
+ - sha1hex
+ - sha256hex
+ - md5hex
+ - size_bytes
+ - mimetype
+ - cdx (may be null)
+ - url
+ - datetime
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+ eg_extra = kwargs.pop('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+ self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+ def want(self, raw_record):
+ """
+ Only want to import records with complete file-level metadata
+ """
+ fm = raw_record['file_meta']
+ if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+ self.counts['skip-file-meta-incomplete'] += 1
+ return False
+ if fm['mimetype'] != 'application/pdf':
+ self.counts['skip-not-pdf'] += 1
+ return False
+ return True
+
+ def parse_record(self, obj):
+ """
+ We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+ """
+
+ shadow_corpus = obj['shadow']['shadow_corpus']
+ assert shadow_corpus == shadow_corpus.strip().lower()
+ doi = clean_doi(obj['shadow'].get('doi'))
+ pmid = clean_pmid(obj['shadow'].get('pmid'))
+ isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
+ shadow_id = obj['shadow'].get('shadow_id').strip()
+ assert shadow_id
+
+ extra = { '{}_id'.format(shadow_corpus): shadow_id }
+ for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ if not ext_id:
+ continue
+ extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+ # lookup release via several idents
+ re = None
+ for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ if not ext_id:
+ continue
+ try:
+ re = self.api.lookup_release(**{ext_type: ext_id})
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status not in (404, 400):
+ raise err
+ re = None
+ if re:
+ break
+
+ if not re:
+ self.counts['skip-release-not-found'] += 1
+ return None
+
+ release_ids = [re.ident,]
+
+ # parse single CDX into URLs (if exists)
+ urls = []
+ if obj.get('cdx'):
+ url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+ if url != None:
+ urls.append(url)
+ wayback = "https://web.archive.org/web/{}/{}".format(
+ obj['cdx']['datetime'],
+ obj['cdx']['url'])
+ urls.append(("webarchive", wayback))
+ urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+ fe = fatcat_openapi_client.FileEntity(
+ md5=obj['file_meta']['md5hex'],
+ sha1=obj['file_meta']['sha1hex'],
+ sha256=obj['file_meta']['sha256hex'],
+ size=int(obj['file_meta']['size_bytes']),
+ mimetype=obj['file_meta']['mimetype'] or None,
+ release_ids=release_ids,
+ urls=urls,
+ extra=dict(shadows=extra),
+ )
+ return fe
+
+ def try_update(self, fe):
+ # lookup sha1, or create new entity
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ return True
+
+ if not existing.extra:
+ existing.extra = {}
+
+ if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+ # already imported from this shadow library; skip
+ self.counts['exists'] += 1
+ return False
+
+ # check for edit conflicts
+ if existing.ident in [e.ident for e in self._edits_inflight]:
+ self.counts['skip-update-inflight'] += 1
+ return False
+ if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+ raise Exception("Inflight insert; shouldn't happen")
+
+ # minimum viable "existing" URL cleanup to fix dupes and broken links:
+ # remove 'None' wayback URLs, and set archive.org rel 'archive'
+ existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ for i in range(len(existing.urls)):
+ u = existing.urls[i]
+ if u.rel == 'repository' and '://archive.org/download/' in u.url:
+ existing.urls[i].rel = 'archive'
+ if u.rel == 'social':
+ u.rel = 'academicsocial'
+
+ # merge the existing into this one and update
+ merged_urls = {}
+ for u in fe.urls + existing.urls:
+ merged_urls[u.url] = u
+ existing.urls = list(merged_urls.values())
+ if not existing.extra.get('shadows'):
+ existing.extra['shadows'] = fe.extra['shadows']
+ else:
+ existing.extra['shadows'].update(fe.extra['shadows'])
+
+ # do these "plus ones" because we really want to do these updates when possible
+ if len(existing.urls) > SANE_MAX_URLS + 1:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
+ existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+ self.counts['skip-update-too-many-releases'] += 1
+ return None
+ existing.mimetype = existing.mimetype or fe.mimetype
+ existing.size = existing.size or fe.size
+ existing.md5 = existing.md5 or fe.md5
+ existing.sha1 = existing.sha1 or fe.sha1
+ existing.sha256 = existing.sha256 or fe.sha256
+ edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+ # add sha1 to non-entity edit row, so we can do more aggressive
+ # group-level de-dupe
+ edit.sha1 = existing.sha1
+ self._edits_inflight.append(edit)
+ self.counts['update'] += 1
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py
index 6a4b1bba..3f4700ff 100644
--- a/python/fatcat_tools/transforms/__init__.py
+++ b/python/fatcat_tools/transforms/__init__.py
@@ -1,5 +1,5 @@
from .entities import entity_to_dict, entity_from_json, entity_from_dict
-from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch
+from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch, file_to_elasticsearch
from .csl import release_to_csl, citeproc_csl
from .ingest import release_ingest_request
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 3a53db4d..87e054ec 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
import collections
+import tldextract
from fatcat_openapi_client import ApiClient
@@ -20,6 +20,7 @@ def test_check_kbart():
assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False
assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True
+
def release_to_elasticsearch(entity, force_bool=True):
"""
Converts from an entity model/schema to elasticsearch oriented schema.
@@ -50,6 +51,10 @@ def release_to_elasticsearch(entity, force_bool=True):
release_stage = release.release_stage,
withdrawn_status = release.withdrawn_status,
language = release.language,
+ volume = release.volume,
+ issue = release.issue,
+ pages = release.pages,
+ number = release.number,
license = release.license_slug,
doi = release.ext_ids.doi,
pmid = release.ext_ids.pmid,
@@ -72,7 +77,7 @@ def release_to_elasticsearch(entity, force_bool=True):
in_dweb = False
in_ia = False
in_ia_sim = False
- in_shadow = False
+ in_shadows = False
release_year = release.release_year
if release.release_date:
@@ -85,11 +90,15 @@ def release_to_elasticsearch(entity, force_bool=True):
t['any_abstract'] = len(release.abstracts or []) > 0
t['ref_count'] = len(release.refs or [])
- t['ref_linked_count'] = 0
- if release.refs:
- t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id])
+ ref_release_ids = []
+ for r in (release.refs or []):
+ if r.target_release_id:
+ ref_release_ids.append(r.target_release_id)
+ t['ref_release_ids'] = ref_release_ids
+ t['ref_linked_count'] = len(ref_release_ids)
t['contrib_count'] = len(release.contribs or [])
contrib_names = []
+ contrib_affiliations = []
creator_ids = []
for c in (release.contribs or []):
if c.raw_name:
@@ -98,8 +107,14 @@ def release_to_elasticsearch(entity, force_bool=True):
contrib_names.append(c.surname)
if c.creator_id:
creator_ids.append(c.creator_id)
+ if c.raw_affiliation:
+ contrib_affiliations.append(c.raw_affiliation)
t['contrib_names'] = contrib_names
t['creator_ids'] = creator_ids
+ t['affiliations'] = contrib_affiliations
+
+ # TODO: mapping... probably by lookup?
+ t['affiliation_rors'] = None
container = release.container
if container:
@@ -134,14 +149,19 @@ def release_to_elasticsearch(entity, force_bool=True):
if c_extra.get('road'):
if c_extra['road'].get('as_of'):
is_oa = True
- if c_extra.get('ezb'):
- if c_extra['ezb'].get('color') == 'green':
- is_oa = True
if c_extra.get('szczepanski'):
if c_extra['szczepanski'].get('as_of'):
is_oa = True
- else:
+ if c_extra.get('country'):
+ t['country_code'] = c_extra['country']
+ t['country_code_upper'] = c_extra['country'].upper()
+
+ # fall back to release-level container metadata if container not linked or
+ # missing context
+ if not t.get('publisher'):
t['publisher'] = release.publisher
+ if not t.get('container_name') and release.extra:
+ t['container_name'] = release.extra.get('container_name')
if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
in_jstor = True
@@ -187,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True):
# TODO: more/better checks here, particularly strict *not* OA licenses
if release.license_slug.startswith("CC-"):
is_oa = True
+ if release.license_slug.startswith("ARXIV-"):
+ is_oa = True
extra = release.extra or dict()
if extra:
@@ -203,6 +225,47 @@ def release_to_elasticsearch(entity, force_bool=True):
if extra['crossref'].get('archive'):
# all crossref archives are KBART, I believe
in_kbart = True
+ # backwards compatible subtitle fetching
+ if not t['subtitle'] and extra.get('subtitle'):
+ if type(extra['subtitle']) == list:
+ t['subtitle'] = extra['subtitle'][0]
+ else:
+ t['subtitle'] = extra['subtitle']
+
+ t['first_page'] = None
+ if release.pages:
+ first = release.pages.split('-')[0]
+ first = first.replace('p', '')
+ if first.isdigit():
+ t['first_page'] = first
+ # TODO: non-numerical first pages
+
+ t['ia_microfilm_url'] = None
+ if in_ia_sim:
+ # TODO: determine URL somehow? I think this is in flux. Will probably
+ # need extra metadata in the container extra field.
+ # special case as a demo for now.
+ if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
+ and release.release_year in (2011, 2013) \
+ and release.issue \
+ and release.issue.isdigit() \
+ and t['first_page']:
+ t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
+ release.release_year,
+ int(release.issue) - 1,
+ t['first_page'],
+ )
+
+ t['doi_registrar'] = None
+ if extra and t['doi']:
+ for k in ('crossref', 'datacite', 'jalc'):
+ if k in extra:
+ t['doi_registrar'] = k
+ if not 'doi_registrar' in t:
+ t['doi_registrar'] = 'crossref'
+
+ if t['doi']:
+ t['doi_prefix'] = t['doi'].split('/')[0]
if is_longtail_oa:
is_oa = True
@@ -215,6 +278,7 @@ def release_to_elasticsearch(entity, force_bool=True):
t['in_jstor'] = bool(in_jstor)
t['in_web'] = bool(in_web)
t['in_dweb'] = bool(in_dweb)
+ t['in_shadows'] = bool(in_shadows)
else:
t['is_oa'] = is_oa
t['is_longtail_oa'] = is_longtail_oa
@@ -223,11 +287,23 @@ def release_to_elasticsearch(entity, force_bool=True):
t['in_jstor'] = in_jstor
t['in_web'] = in_web
t['in_dweb'] = in_dweb
+ t['in_shadows'] = in_shadows
t['in_ia'] = bool(in_ia)
t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+
+ if in_ia or t.get('pmcid') or t.get('arxiv_id'):
+ t['preservation'] = 'bright'
+ elif in_kbart or in_jstor:
+ t['preservation'] = 'dark'
+ elif in_shadows:
+ t['preservation'] = 'shadows_only'
+ else:
+ t['preservation'] = 'none'
+
return t
+
def container_to_elasticsearch(entity, force_bool=True):
"""
Converts from an entity model/schema to elasticsearch oriented schema.
@@ -257,23 +333,27 @@ def container_to_elasticsearch(entity, force_bool=True):
wikidata_qid = entity.wikidata_qid,
)
- # TODO: region, discipline
- # TODO: single primary language?
if not entity.extra:
entity.extra = dict()
- for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+ for key in ('country', 'languages', 'mimetypes', 'original_name',
+ 'first_year', 'last_year', 'aliases', 'abbrev', 'region',
+ 'discipline'):
if entity.extra.get(key):
t[key] = entity.extra[key]
+ if 'country' in t:
+ t['country_code'] = t.pop('country')
+
+ t['issns'] = []
+ if entity.issnl:
+ t['issns'].append(entity.issnl)
+ for key in ('issnp', 'issne'):
+ if entity.extra.get(key):
+ t['issns'].append(entity.extra[key])
+
in_doaj = None
in_road = None
- # TODO: not currently implemented
- in_doi = None
- # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
- #in_doaj_works = None
- in_sherpa_romeo = None
is_oa = None
- # TODO: not actually set/stored anywhere?
is_longtail_oa = None
any_kbart = None
any_jstor = None
@@ -286,17 +366,15 @@ def container_to_elasticsearch(entity, force_bool=True):
if extra.get('road'):
if extra['road'].get('as_of'):
in_road = True
- if extra.get('ezb'):
- if extra['ezb'].get('color') == 'green':
- is_oa = True
if extra.get('szczepanski'):
if extra['szczepanski'].get('as_of'):
is_oa = True
if extra.get('default_license'):
if extra['default_license'].startswith('CC-'):
is_oa = True
+ t['sherpa_romeo_color'] = None
if extra.get('sherpa_romeo'):
- in_sherpa_romeo = True
+ t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color')
if extra['sherpa_romeo'].get('color') == 'white':
is_oa = False
if extra.get('kbart'):
@@ -306,54 +384,128 @@ def container_to_elasticsearch(entity, force_bool=True):
if extra.get('ia'):
if extra['ia'].get('sim'):
any_ia_sim = True
+ if extra['ia'].get('longtail_oa'):
+ is_longtail_oa = True
t['is_superceded'] = bool(extra.get('superceded'))
t['in_doaj'] = bool(in_doaj)
t['in_road'] = bool(in_road)
- t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
t['any_kbart'] = bool(any_kbart)
- t['is_longtail_oa'] = bool(is_longtail_oa)
if force_bool:
- t['in_doi'] = bool(in_doi)
- t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
+ t['is_oa'] = bool(in_doaj or in_road or is_oa)
+ t['is_longtail_oa'] = bool(is_longtail_oa)
t['any_jstor'] = bool(any_jstor)
t['any_ia_sim'] = bool(any_ia_sim)
else:
- t['in_doi'] = in_doi
- t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
+ t['is_oa'] = in_doaj or in_road or is_oa
+ t['is_longtail_oa'] = is_longtail_oa
t['any_jstor'] = any_jstor
t['any_ia_sim'] = any_ia_sim
return t
+def _type_of_edit(edit):
+ if edit.revision == None and edit.redirect_ident == None:
+ return 'delete'
+ elif edit.redirect_ident:
+ # redirect
+ return 'update'
+ elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
+ return 'create'
+ else:
+ return 'update'
+
+
def changelog_to_elasticsearch(entity):
+ """
+ Note that this importer requires expanded fill info to work. Calling code
+ may need to re-fetch editgroup from API to get the 'editor' field. Some of
+ the old kafka feed content doesn't includes editor in particular.
+ """
editgroup = entity.editgroup
t = dict(
index=entity.index,
editgroup_id=entity.editgroup_id,
- timestamp=entity.timestamp,
+ timestamp=entity.timestamp.isoformat(),
editor_id=editgroup.editor_id,
+ username=editgroup.editor.username,
+ is_bot=editgroup.editor.is_bot,
+ is_admin=editgroup.editor.is_admin,
)
extra = editgroup.extra or dict()
if extra.get('agent'):
t['agent'] = extra['agent']
- t['containers'] = len(editgroup.edits.containers)
- t['creators'] = len(editgroup.edits.containers)
- t['files'] = len(editgroup.edits.containers)
- t['filesets'] = len(editgroup.edits.containers)
- t['webcaptures'] = len(editgroup.edits.containers)
- t['releases'] = len(editgroup.edits.containers)
- t['works'] = len(editgroup.edits.containers)
-
- # TODO: parse and pull out counts
- #created = 0
- #updated = 0
- #deleted = 0
- #t['created'] = created
- #t['updated'] = updated
- #t['deleted'] = deleted
- #t['total'] = created + updated + deleted
+ containers = [_type_of_edit(e) for e in editgroup.edits.containers]
+ creators = [_type_of_edit(e) for e in editgroup.edits.creators]
+ files = [_type_of_edit(e) for e in editgroup.edits.files]
+ filesets = [_type_of_edit(e) for e in editgroup.edits.filesets]
+ webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures]
+ releases = [_type_of_edit(e) for e in editgroup.edits.releases]
+ works = [_type_of_edit(e) for e in editgroup.edits.works]
+
+ t['containers'] = len(containers)
+ t['new_containers'] = len([e for e in containers if e == 'create'])
+ t['creators'] = len(creators)
+ t['new_creators'] = len([e for e in creators if e == 'create'])
+ t['files'] = len(files)
+ t['new_files'] = len([e for e in files if e == 'create'])
+ t['filesets'] = len(filesets)
+ t['new_filesets'] = len([e for e in filesets if e == 'create'])
+ t['webcaptures'] = len(webcaptures)
+ t['new_webcaptures'] = len([e for e in webcaptures if e == 'create'])
+ t['releases'] = len(releases)
+ t['new_releases'] = len([e for e in releases if e == 'create'])
+ t['works'] = len(works)
+ t['new_works'] = len([e for e in works if e == 'create'])
+
+ all_edits = containers + creators + files + filesets + webcaptures + releases + works
+
+ t['created'] = len([e for e in all_edits if e == 'create'])
+ t['updated'] = len([e for e in all_edits if e == 'update'])
+ t['deleted'] = len([e for e in all_edits if e == 'delete'])
+ t['total'] = len(all_edits)
+ return t
+
+
+def file_to_elasticsearch(entity):
+ """
+ Converts from an entity model/schema to elasticsearch oriented schema.
+
+ Returns: dict
+ Raises exception on error (never returns None)
+ """
+
+ if entity.state in ('redirect', 'deleted'):
+ return dict(
+ ident = entity.ident,
+ state = entity.state,
+ )
+ elif entity.state != 'active':
+ raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+ # First, the easy ones (direct copy)
+ t = dict(
+ ident = entity.ident,
+ state = entity.state,
+ revision = entity.revision,
+ release_ids = entity.release_ids,
+ release_count = len(entity.release_ids),
+ mimetype = entity.mimetype,
+ size_bytes = entity.size,
+ sha1 = entity.sha1,
+ sha256 = entity.sha256,
+ md5 = entity.md5,
+ )
+
+ parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+ t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
+ t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
+ t['rels'] = list(set([u.rel for u in entity.urls]))
+
+ t['in_ia'] = bool('archive.org' in t['domains'])
+ t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+
return t
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 7a9a585d..b84d5e70 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -105,6 +105,8 @@ class EntityUpdatesWorker(FatcatWorker):
self.live_pdf_ingest_doi_prefix_acceptlist = [
# biorxiv and medrxiv
"10.1101/",
+ # researchgate
+ "10.13140/",
]
def want_live_ingest(self, release, ingest_request):
@@ -121,6 +123,33 @@ class EntityUpdatesWorker(FatcatWorker):
ingest_type = ingest_request.get('ingest_type')
doi = ingest_request.get('ext_ids', {}).get('doi')
+ is_document = release.release_type in (
+ 'article-journal',
+ 'paper-conference',
+ 'article',
+ 'report',
+ 'chapter',
+ 'manuscript',
+ 'review',
+ 'thesis',
+ 'letter',
+ 'editorial',
+ 'abstract',
+ 'entry',
+ 'patent',
+ 'post',
+ 'review-book',
+ )
+ is_not_pdf = release.release_type in (
+ 'dataset',
+ 'stub',
+ 'software',
+ 'figure',
+ 'graphic',
+ )
+
+ # accept list sets a default "crawl it" despite OA metadata for
+ # known-OA DOI prefixes
in_acceptlist = False
if doi:
for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
@@ -129,9 +158,18 @@ class EntityUpdatesWorker(FatcatWorker):
if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
es = release_to_elasticsearch(release)
- if not es['is_oa'] and not in_acceptlist:
+ # most datacite documents are in IRs and should be crawled
+ is_datacite_doc = False
+ if release.extra and ('datacite' in release.extra) and is_document:
+ is_datacite_doc = True
+ if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
return False
+ # if ingest_type is pdf but release_type is almost certainly not a PDF,
+ # skip it. This is mostly a datacite thing.
+ if ingest_type == "pdf" and is_not_pdf:
+ return False
+
if ingest_type == "pdf" and doi:
for prefix in self.ingest_pdf_doi_prefix_blocklist:
if doi.startswith(prefix):
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
index ccb13871..23a56109 100755
--- a/python/fatcat_transform.py
+++ b/python/fatcat_transform.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
"""
+Utility script for doing bulk conversion/tranforms of entity JSON schema to
+other formats
"""
import sys
@@ -15,10 +17,11 @@ from citeproc_styles import get_style_filepath
import fatcat_openapi_client
from fatcat_openapi_client.rest import ApiException
-from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ChangelogEntry
+from fatcat_openapi_client import ReleaseEntity, ContainerEntity, FileEntity, ChangelogEntry
from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \
release_to_elasticsearch, container_to_elasticsearch, \
- changelog_to_elasticsearch, public_api, release_to_csl, citeproc_csl
+ file_to_elasticsearch, changelog_to_elasticsearch, public_api, \
+ release_to_csl, citeproc_csl
def run_elasticsearch_releases(args):
@@ -27,6 +30,8 @@ def run_elasticsearch_releases(args):
if not line:
continue
entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
+ if entity.state != 'active':
+ continue
args.json_output.write(
json.dumps(release_to_elasticsearch(entity)) + '\n')
@@ -36,9 +41,22 @@ def run_elasticsearch_containers(args):
if not line:
continue
entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client)
+ if entity.state != 'active':
+ continue
args.json_output.write(
json.dumps(container_to_elasticsearch(entity)) + '\n')
+def run_elasticsearch_files(args):
+ for line in args.json_input:
+ line = line.strip()
+ if not line:
+ continue
+ entity = entity_from_json(line, FileEntity, api_client=args.api.api_client)
+ if entity.state != 'active':
+ continue
+ args.json_output.write(
+ json.dumps(file_to_elasticsearch(entity)) + '\n')
+
def run_elasticsearch_changelogs(args):
for line in args.json_input:
line = line.strip()
@@ -54,6 +72,8 @@ def run_citeproc_releases(args):
if not line:
continue
entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
+ if entity.state != 'active':
+ continue
csl_json = release_to_csl(entity)
csl_json['id'] = "release:" + (entity.ident or "unknown")
out = citeproc_csl(csl_json, args.style, args.html)
@@ -87,6 +107,16 @@ def main():
help="where to send output",
default=sys.stdout, type=argparse.FileType('w'))
+ sub_elasticsearch_files = subparsers.add_parser('elasticsearch-files',
+ help="convert fatcat file JSON schema to elasticsearch file schema")
+ sub_elasticsearch_files.set_defaults(func=run_elasticsearch_files)
+ sub_elasticsearch_files.add_argument('json_input',
+ help="JSON-per-line of file entities",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_elasticsearch_files.add_argument('json_output',
+ help="where to send output",
+ default=sys.stdout, type=argparse.FileType('w'))
+
sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs',
help="convert fatcat changelog JSON schema to elasticsearch changelog schema")
sub_elasticsearch_changelogs.set_defaults(func=run_elasticsearch_changelogs)
diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py
index af0fea83..591dda80 100644
--- a/python/fatcat_web/entity_helpers.py
+++ b/python/fatcat_web/entity_helpers.py
@@ -53,6 +53,10 @@ def enrich_release_entity(entity):
entity._es = release_to_elasticsearch(entity, force_bool=False)
if entity.container and entity.container.state == "active":
entity.container._es = container_to_elasticsearch(entity.container, force_bool=False)
+ if entity.files:
+ # remove shadows-only files with no URLs
+ entity.files = [f for f in entity.files
+ if not (f.extra and f.extra.get('shadows') and not f.urls)]
if entity.filesets:
for fs in entity.filesets:
fs._total_size = sum([f.size for f in fs.manifest])
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 7c60a6dd..6b2b9cc1 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -77,7 +77,7 @@ def do_release_search(q, limit=30, fulltext_only=True, offset=0):
"default_operator": "AND",
"analyze_wildcard": True,
"lenient": True,
- "fields": ["title^5", "contrib_names^2", "container_title"],
+ "fields": ["biblio"],
},
},
}
@@ -106,7 +106,7 @@ def do_container_search(q, limit=30, offset=0):
"default_operator": "AND",
"analyze_wildcard": True,
"lenient": True,
- "fields": ["name^5", "publisher"],
+ "fields": ["biblio"],
},
},
}
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 83ecd1c8..961b4759 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -196,8 +196,9 @@
</tbody>
</table>
{% else %}
-<p>There are no known files associated with this release (you could try
-<a href="/work/{{ release.work_id }}">other releases for this work?</a>).
+<p>There are no accessible files associated with this release. You could check
+<a href="/work/{{ release.work_id }}">other releases for this work</a> for an
+accessible version.
{% endif %}
{% endif %}
diff --git a/python/tests/files/changelog_3469683.json b/python/tests/files/changelog_3469683.json
new file mode 100644
index 00000000..7a847b16
--- /dev/null
+++ b/python/tests/files/changelog_3469683.json
@@ -0,0 +1 @@
+{"index":3469683,"editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","timestamp":"2020-01-30T05:04:39.738601Z","editgroup":{"editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","editor_id":"scmbogxw25evtcesfcab5qaboa","editor":{"editor_id":"scmbogxw25evtcesfcab5qaboa","username":"crawl-bot","is_admin":true,"is_bot":true,"is_active":true},"changelog_index":3469683,"created":"2020-01-30T05:04:39.738601Z","description":"Files crawled from web using sandcrawler ingest tool","extra":{"agent":"fatcat_tools.IngestFileResultImporter","git_rev":"v0.3.1-280-ga889f32"},"edits":{"containers":[],"creators":[],"files":[{"edit_id":"ba819a2b-a4d0-43e6-9e5c-505284c8ae42","ident":"e3lmbzqyjjam3a5nqnccc6d654","revision":"7a606095-9d07-41ee-898a-bcf8b6bc0004","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/23802359.2020.1715878"}},{"edit_id":"a71c4b91-d599-4422-a7a6-527562161278","ident":"e62h2fa6fba6ve3lukv7n635fq","revision":"1374f1bd-684a-48b9-aaff-65b9e90083b5","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1186/s13071-020-3909-6"}},{"edit_id":"82e4d65d-0335-4a40-b9c9-bf38f9bd7b19","ident":"fam7ii245zasvnesikw7bhmoii","revision":"327e3358-2b2b-4919-9613-449bdbb76c55","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i1.8294"}},{"edit_id":"49e987dc-fc6f-4391-8b75-33176f03b5cb","ident":"fa6sljsebjapfojqapxd3dj4um","revision":"7f51f4fa-e448-410a-b7d5-7ae9cdf9fcb8","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33536/jcpe.v4i1.293"}},{"edit_id":"e62ed1ca-3961-423b-ab05-967694e32f70","ident":"fhhzbabf3zcx7p2tor2omqveyq","revision":"d40c3f8c-255f-4913-ace8-06db6af66697","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.59"}},{"edit_id":"337b8ed6-1248-4872-81da-d16f6db021e6","ident":"fllcoo4smfdyrh5q5lmu72e7cq","revision":"724b26cd-6ab1-46a1-bc88-a87410fdf102","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/1343943x.2020.1717970"}},{"edit_id":"43a0c9d5-692e-4c73-b154-d8769854d268","ident":"fzshcc6sfzegbduum2763o3lgy","revision":"a8f5e34b-fbf4-4d52-987e-887455e6bd50","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i1.8099"}},{"edit_id":"48767850-1141-47e3-80ef-556605e3588c","ident":"grdztt2vwjd65ovcifeo3ysbam","revision":"12dceb74-6333-4a17-b1e8-4afac9df1888","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.26565/2220-8089-2019-36-02"}},{"edit_id":"4b3b92ac-c250-4b78-9e91-43fc893c935e","ident":"hgpzsozky5amvcts45qb6nhqum","revision":"b5951fd6-dd36-422c-9770-8cdfb7e6d82d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934410"}},{"edit_id":"dabff0df-6f52-4adb-a63f-1965f30d8bd2","ident":"hpxdh7mykng77jyfoolpixzw2y","revision":"0e06e57c-0756-4ee5-8102-0d96478aa23f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.26452/ijrps.v11i1.1916"}},{"edit_id":"b757f2b5-5f97-45b8-bfcd-9c9fb08047b8","ident":"htorhznxdfdppbvpf57nrz536q","revision":"034f4640-fbb6-4123-9ad8-f934220ab820","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra09196c"}},{"edit_id":"5662469f-a4d3-4339-88a0-4d27ef3f5f58","ident":"if6a63p7rnaxbjkcr4egtihj74","revision":"08d384cf-396c-41e1-a14d-6bde24b47323","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1192/bjb.2019.92"}},{"edit_id":"540a1ec1-7879-42f4-b749-d3287eb26ef7","ident":"i76ou5g2jndo7gyjrxdhoks3bm","revision":"eddbf701-59c3-4f33-b26b-87ad29297f65","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934355"}},{"edit_id":"04a0b805-b09f-4ff6-80b3-77cb643c72d7","ident":"jl2z2az4sjfpdigdts3xjm24vy","revision":"a541e130-07be-43ae-8fbf-b0295d5b576f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/23802359.2020.1719936"}},{"edit_id":"3e4566b9-7e3f-4e02-aa81-b6299545b150","ident":"jsrgx72devbadbyyiqwm2bl4aa","revision":"f51f164e-0e96-4245-973b-3408250ebc3b","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.09"}},{"edit_id":"80c2f0ca-ba43-4481-b5a5-c15d87d4d7b4","ident":"kijz6wlf25dito3av5snrz345a","revision":"fa1b5279-2221-4063-853e-070ca2d5954b","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s2045796020000037"}},{"edit_id":"c0f44f12-e4c7-45d4-a79a-a3ce2d628e78","ident":"ky3cjfbzejecxaa5tjaaucurb4","revision":"0e227663-f825-42e8-bf57-ba50fea60bf5","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s204579602000013x"}},{"edit_id":"92093947-7ff5-48ad-bc47-b26d4c0959c8","ident":"llfciusk6jf6rd35ofpuufmgfe","revision":"ed89f8b0-98cd-4886-b99e-b5d276b2ac9f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934349"}},{"edit_id":"4cb0e042-4298-4d0b-98b8-3c0c808642ea","ident":"mdw33cq7svdfnlaevnpih7bsyu","revision":"4a0ed3ce-56f2-4299-861c-051e7c06499d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i2.8073"}},{"edit_id":"611fc1e1-79e1-4e10-a702-0420919407f6","ident":"mwghms3u2zecdf2x5zk7tzs4mq","revision":"101d7d0f-6aca-4df3-a08a-f4a3f26d3cd4","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1186/s12864-020-6509-0"}},{"edit_id":"b9a350bd-3497-4c83-97c1-0a20a420a287","ident":"ooejwh3g35cfrmmk4bvjcqxrai","revision":"8a4e8a05-3fb4-4d83-a545-35f152e24f58","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.6018/cpd.340501"}},{"edit_id":"aeab2a58-ec4e-4a2a-82ee-a0d2428eca50","ident":"pepmo2ajfzh7ldtqdugj4p5zvy","revision":"7d13f42d-ce94-4537-8c56-0158d9bf99e9","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.6018/cpd.347981"}},{"edit_id":"f3cd1575-fcfd-4f6b-ab48-28b06592df85","ident":"qcw5h7c5uncbbphtrp3zibn5y4","revision":"fa7b2868-777e-4046-98ca-b146c0056180","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s1742170519000425"}},{"edit_id":"3c0ffa45-7f39-4393-a73c-0615bf9543f6","ident":"qfmufcdlrbfyhcjemvut5om23i","revision":"fac63461-4790-49a8-9b75-b23e9a369529","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.34225/jidc.2019.14.2.57"}},{"edit_id":"4ad40f3a-55f4-4aa8-9016-d04f904b2163","ident":"qrpp45vopfbvzanalh4lmpiz24","revision":"2ac1355f-cba9-49ed-b260-9eea5ec09473","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.19"}},{"edit_id":"1ec3a67b-796c-4393-93c2-ddac4ea66e0f","ident":"q7cv6lezvjalpg5xd4tckkbmsi","revision":"702adc9f-9ecb-4030-a095-936f15839c31","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934359"}},{"edit_id":"a8ab2cf7-8aad-4060-b281-371d41df32cf","ident":"rqcd44zbfvcovipfpnqpowiq2e","revision":"bd51fd60-438a-42fe-80e9-104705b0580f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934350"}},{"edit_id":"6d626d5c-1c7c-42e3-8d00-c313a91fc7c1","ident":"ruzbactehngbrlyx4vq3zlaqlu","revision":"5da34289-7da0-42ad-98bb-1c0b21bb6e69","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.55"}},{"edit_id":"4651ac6b-228c-4918-ac74-0757ce64c031","ident":"rvbhhfvwc5dkhdpxekutlew7di","revision":"53b088f6-653c-4e6d-974a-8e5e6e154f12","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0190"}},{"edit_id":"156b4a96-af8b-4c01-a8ec-6aa53736450f","ident":"r6j7ad7vmvg75lvel6v4e4gbb4","revision":"46647620-a36a-482b-a651-492a4e6ca1bb","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.53"}},{"edit_id":"67e6f76f-859d-422d-8c90-715b3173e24c","ident":"scajd3ykrjatbjmifvxwcl3yhu","revision":"d99f1da3-484a-4977-bc14-6e952d007acd","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.11"}},{"edit_id":"22395a5e-2a2e-4bf2-9547-0fb5122f8a7f","ident":"sk22wngc3fh23b74nmkbgeeyya","revision":"7423067d-3a18-4305-98b4-5cee8127e24a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0300"}},{"edit_id":"4ea1f9dd-d85e-450b-bc2a-995364fcf3fe","ident":"tma3dvw77bghjfaqrdoiwquuge","revision":"c86364bf-501b-42b4-a3ed-ef334ff26e0a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.36447/estudios2014.v33-34.art3"}},{"edit_id":"87d158ff-b0e0-4944-88d9-017dcef70d6e","ident":"tvuumx4n75dorebv7bu7imyiym","revision":"ae17166e-43ce-4d5a-95de-4befeeb76fa6","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.35236/jots.663726"}},{"edit_id":"2114c75b-ac23-471a-83d9-c76ae5f6bf42","ident":"twnql5u4mbfqffal2atv5qucoq","revision":"cb73ac6c-bc2b-422b-bc27-7032d14cac4d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0162"}},{"edit_id":"2f1d20fd-e69e-4490-9f70-f2b043f53634","ident":"tzwyuimcgrepvhide2sj3lovjm","revision":"b9cf9229-db47-4fa0-a8f6-45320b5af440","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934348"}},{"edit_id":"fdb6c388-9b22-45ff-8db1-1531eac43bbf","ident":"vfwz3fuvbbgcxek2hi6vjo525q","revision":"6c57ecb2-a870-4078-af17-a779ca3ceb28","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.4197/eco.29-1.8"}},{"edit_id":"3b8cf6fa-75e7-4b15-a83c-784552ff76e9","ident":"vtq5tvfltbfv3pizvux32ek5hi","revision":"7c0b5d64-6d20-407b-9de3-482a9c75f40e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jurisprudence.v8i2.6977"}},{"edit_id":"899303e8-ce86-4f53-909f-249ef45d9a3b","ident":"vuneewh3fncxpb7bzviptx6kze","revision":"3fd4ff1c-1855-41e2-9bae-7edbae86783a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.3846/jcem.2020.11826"}},{"edit_id":"c0595f10-72dc-4144-b243-5b344912842f","ident":"v2i53kwtnbea7kfhsnvcpkvixe","revision":"09891662-3d67-4d7f-a4ed-7f9bfa3f426f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934347"}},{"edit_id":"80f26273-d243-4dcf-af4a-e05226ba679c","ident":"wtish6c32randpuucjxq5byjo4","revision":"a69cd59a-52bf-4c48-9323-62a4070b2440","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.29261/pakvetj/2019.033"}},{"edit_id":"79f1bdc1-2627-4328-97ff-71731295fcad","ident":"xkf7a4cavnhahnfsjj5w5aoopu","revision":"b826cb89-efcc-427f-a378-0829bb2b871c","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.36447/estudios2014.v33-34.art6"}},{"edit_id":"695feb37-ab8e-4ecd-8c9e-fae06ff63e39","ident":"yx24fslfafb6dgx7gvjbmoma5m","revision":"4109aeb1-fcc1-47f3-9d2c-8e3abfbc697e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra10366j"}},{"edit_id":"48f2eb73-a5a3-440a-8bd1-c581797a82ca","ident":"3p5rah3wbfftdht7rabkpjfcrm","revision":"df754aec-4750-45eb-97e6-943928dad661","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.01"}},{"edit_id":"7a937ed3-5362-40ed-8f96-e6c6231e0adf","ident":"4r3madqhfzb5jb7jd7xmv55em4","revision":"6feccd54-38bd-4d51-8f42-fe211baf5ba3","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.35236/jots.668781"}},{"edit_id":"3beaa94e-c4e1-4a6b-96a7-d455ad13b7aa","ident":"53ooeweri5efjm5vhl2bwjcfze","revision":"a9d318ad-eb2a-4590-a463-8d6016e2e887","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s2045796020000049"}},{"edit_id":"8a0b80e2-c8b2-4457-ac89-2fc7ad7548f7","ident":"546k37iji5bfffakw2egl2azxy","revision":"3a4d93d0-c59e-4f81-8fbb-40ce21b11b1e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra08019h"}},{"edit_id":"15fa5e34-9829-483e-bfa5-a4011b974c6b","ident":"6cy3aonbdfgxbjxnujg3hsqx7q","revision":"021fed8f-5109-4389-9e7f-13a70cbaf4a3","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.2478/joim-2019-0023"}},{"edit_id":"6bc507c8-ceea-49a5-8c1f-9c652463588e","ident":"6rxwlcytwzeopgrhidvi236b2q","revision":"60307cd5-28a3-4063-9111-d5e90e1cb346","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.3846/aviation.2019.11913"}},{"edit_id":"283013c6-4400-4fdc-b2b2-1dbd1a262332","ident":"7j4w24plxzc3nnrkorbowmnra4","revision":"0fa4112c-a596-49a2-9d36-82c213db3fb8","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.22"}}],"filesets":[],"webcaptures":[],"releases":[],"works":[]}}} \ No newline at end of file
diff --git a/python/tests/files/example_shadow.json b/python/tests/files/example_shadow.json
new file mode 100644
index 00000000..3386f481
--- /dev/null
+++ b/python/tests/files/example_shadow.json
@@ -0,0 +1,10 @@
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":"application/pdf"},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":"application/pdf"},"cdx":{"url":"http://proteomics.bioprojects.org/pavel/papers/SST_versus_EST_in_gene_recognition..pdf","datetime":"20081121222143","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"1227992340180_31-c/1227992509265_9.arc.gz","warc_csize":61212,"warc_offset":62956683,"row_created":"2020-01-07T02:06:33.965383+00:00"}}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"9311918","sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","doi":"10.1111/j.1439-0507.2008.01572.x","pmid":"18721331","isbn13":null},"file_meta":{"sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","sha256hex":"713758ce0417f604c0a4b0bf5b5eea571a9b08ca4cc81a98d602c43f42abfe37","md5hex":"0df123e6305c617ffd38ebef90b1e318","size_bytes":178664,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"7757772","sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","doi":"10.1007/s10464-011-9424-3","pmid":"21287262","isbn13":null},"file_meta":{"sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","sha256hex":"ee1bce27134ae55b3d67f9b31f66571e41ac496fc3fb526dec2d53513b8f6deb","md5hex":"e72c5cf3d61635821e78ca0306c98887","size_bytes":337857,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"74272862","sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","doi":"10.1002/slct.201802783","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","sha256hex":"f277eefc7b1466df814a7a892ab8e2e7f08db1faae0bf73b893211e5f5b37193","md5hex":"27534b8494f54ba5de47c16fb2590b04","size_bytes":1372272,"mimetype":"application/pdf"},"cdx":null}
diff --git a/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json b/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json
new file mode 100644
index 00000000..bed8977d
--- /dev/null
+++ b/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json
@@ -0,0 +1 @@
+{"release_ids":["5tbuas2e4vd6jaowbgzmmhhqxe"],"mimetype":"application/pdf","urls":[{"url":"https://web.archive.org/web/20200130042753/https://www.zhros.ru/jour/article/download/811/542","rel":"webarchive"},{"url":"https://www.zhros.ru/jour/article/download/811/542","rel":"web"}],"sha256":"1665cdb90b73c684233038601c52995acef77bb37aefc6e63ae13e4194d48261","sha1":"3ad4df99ff1354ec0b5a333a59fba9a3a5d9812a","md5":"39159f9c8e98a245f954c9000b0f2810","size":739980,"revision":"dcc7a975-725d-4bc9-8c3f-cd0476cd485e","ident":"bcah4zp5tvdhjl5bqci2c2lgfa","state":"active"} \ No newline at end of file
diff --git a/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json b/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json
new file mode 100644
index 00000000..1204c95d
--- /dev/null
+++ b/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json
@@ -0,0 +1 @@
+{"abstracts":[],"refs":[{"index":0,"extra":{"issue":"Suppl 1","volume":"118"},"key":"10.1111/j.1471-0528.2011.03098.x-BIB1|cit1","year":2011,"container_name":"BJOG","title":"Saving Mothers' Lives: reviewing maternal deaths to make motherhood safer-2006-2008. The Eighth Report of the Confidential Enquiries into Maternal Deaths in the United Kingdom"}],"contribs":[{"index":0,"raw_name":"Philip Steer","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"Wiley","pages":"1404-1404","issue":"11","volume":"118","ext_ids":{"doi":"10.1111/j.1471-0528.2011.03098.x"},"release_year":2011,"release_date":"2011-09-09","release_stage":"published","release_type":"article-journal","container_id":"hl5g6d5msjcl7hlbyyvcsbhc2u","webcaptures":[],"filesets":[],"files":[],"container":{"wikidata_qid":"Q15724571","issnl":"1470-0328","publisher":"Wiley (Blackwell Publishing)","container_type":"journal","name":"BJOG: an International Journal of Obstetrics and Gynaecology","extra":{"abbrev":"BJOG","country":"gb","ia":{"sim":{"year_spans":[[1902,1915],[1921,2015]]}},"issne":"1471-0528","issnp":"1470-0328","kbart":{"clockss":{"year_spans":[[1989,1989],[1993,1993],[2002,2003],[2009,2017]]},"portico":{"year_spans":[[1902,2019]]}},"languages":["en"],"sherpa_romeo":{"color":"yellow"},"urls":["http://www.bjog.org/view/0/index.html"]},"revision":"ec26766c-c1fe-453b-837d-087cc254fe07","ident":"hl5g6d5msjcl7hlbyyvcsbhc2u","state":"active"},"work_id":"wmwe5wwkzfcs7gyjfgdeanksha","title":"Saving Mothers' Lives. Reviewing maternal deaths to make motherhood safer: 2006-2008","state":"active","ident":"etodop5banbndg3faecnfm6ozi","revision":"deb7e050-6df6-42ed-9704-788a0e30facf","extra":{"crossref":{"type":"journal-article"},"subtitle":["Correpondence"]}} \ No newline at end of file
diff --git a/python/tests/import_shadow.py b/python/tests/import_shadow.py
new file mode 100644
index 00000000..70a918d2
--- /dev/null
+++ b/python/tests/import_shadow.py
@@ -0,0 +1,61 @@
+
+import json
+import pytest
+from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher
+from fixtures import api
+
+
+@pytest.fixture(scope="function")
+def shadow_importer(api):
+ yield ShadowLibraryImporter(api)
+
+# TODO: use API to check that entities actually created...
+def test_shadow_importer_basic(shadow_importer):
+ with open('tests/files/example_shadow.json', 'r') as f:
+ JsonLinePusher(shadow_importer, f).run()
+
+def test_shadow_importer(shadow_importer):
+ last_index = shadow_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/example_shadow.json', 'r') as f:
+ shadow_importer.bezerk_mode = True
+ counts = JsonLinePusher(shadow_importer, f).run()
+ assert counts['insert'] == 2
+ assert counts['exists'] == 0
+ assert counts['skip'] == 8
+
+ # fetch most recent editgroup
+ change = shadow_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "shadow library" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.ShadowLibraryImporter" in eg.extra['agent']
+
+ # re-insert; should skip
+ with open('tests/files/example_shadow.json', 'r') as f:
+ shadow_importer.reset()
+ shadow_importer.bezerk_mode = False
+ counts = JsonLinePusher(shadow_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 2
+ assert counts['skip'] == 8
+
+def test_shadow_dict_parse(shadow_importer):
+ with open('tests/files/example_shadow.json', 'r') as f:
+ raw = json.loads(f.readline())
+ f = shadow_importer.parse_record(raw)
+
+ assert f.sha1 == "0000002922264275f11cca7b1c3fb662070d0dd7"
+ assert f.md5 == "debd8db178fa08a7a0aaec6e42832a8e"
+ assert f.sha256 == "b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79"
+ assert f.mimetype == "application/pdf"
+ assert f.size == 206121
+ assert len(f.urls) == 2
+ for u in f.urls:
+ if u.rel == "publisher":
+ assert u.url.startswith("https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf")
+ if u.rel == "webarchive":
+ assert u.url.startswith("https://web.archive.org/")
+ assert "20180729135948" in u.url
+ assert len(f.release_ids) == 1
+
diff --git a/python/tests/transform_tests.py b/python/tests/transform_elasticsearch.py
index f254e117..a954fc4d 100644
--- a/python/tests/transform_tests.py
+++ b/python/tests/transform_elasticsearch.py
@@ -7,6 +7,7 @@ from fixtures import api
from import_journal_metadata import journal_metadata_importer
from import_crossref import crossref_importer
+from import_matched import matched_importer
def test_basic_elasticsearch_convert(crossref_importer):
with open('tests/files/crossref-works.single.json', 'r') as f:
@@ -72,14 +73,67 @@ def test_rich_elasticsearch_convert():
assert es['ref_count'] == 2
assert es['ref_linked_count'] == 1
-def test_elasticsearch_from_json():
- r = entity_from_json(open('./tests/files/math_universe.json', 'r').read(), ReleaseEntity)
- release_to_elasticsearch(r)
+def test_elasticsearch_release_from_json():
+ r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
+ es = release_to_elasticsearch(r)
+
+ assert es['subtitle'] == "Correpondence"
+ assert es['ident'] == "etodop5banbndg3faecnfm6ozi"
+ assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology"
+ assert es['first_page'] == "1404"
+ assert es['issue'] == "11"
+ assert es['volume'] == "118"
+ assert es['number'] == None
+ assert es['in_ia_sim'] == True
+ assert es['in_kbart'] == True
-def test_elasticsearch_container_convert(journal_metadata_importer):
+def test_elasticsearch_container_transform(journal_metadata_importer):
with open('tests/files/journal_metadata.sample.json', 'r') as f:
raw = json.loads(f.readline())
c = journal_metadata_importer.parse_record(raw)
c.state = 'active'
es = container_to_elasticsearch(c)
assert es['publisher'] == c.publisher
+
+def test_elasticsearch_file_transform(matched_importer):
+ f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity)
+
+ f.state = 'active'
+ es = file_to_elasticsearch(f)
+ assert es['sha1'] == f.sha1
+ assert es['sha256'] == f.sha256
+ assert es['md5'] == f.md5
+ assert es['size_bytes'] == f.size
+ assert es['mimetype'] == f.mimetype
+ assert es['in_ia'] == True
+
+ assert 'web' in es['rels']
+ assert 'www.zhros.ru' in es['hosts']
+ assert 'zhros.ru' in es['domains']
+ assert 'archive.org' in (es['hosts'] + es['domains'])
+ assert 'web.archive.org' in (es['hosts'] + es['domains'])
+ # old regression
+ assert not '.archive.org' in (es['hosts'] + es['domains'])
+
+def test_elasticsearch_changelog_transform(matched_importer):
+ ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
+
+ es = changelog_to_elasticsearch(ce)
+ assert es['index'] == 3469683
+ # len("2020-01-30T05:04:39") => 19
+ assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19]
+ assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa"
+ assert es['username'] == "crawl-bot"
+ assert es['is_bot'] == True
+ assert es['is_admin'] == True
+ assert es['agent'] == "fatcat_tools.IngestFileResultImporter"
+
+ assert es['total'] == 50
+ assert es['files'] == 50
+ assert es['new_files'] == 50
+ assert es['created'] == 50
+
+ assert es['releases'] == 0
+ assert es['new_releases'] == 0
+ assert es['updated'] == 0
+ assert es['deleted'] == 0