summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG.md31
-rw-r--r--extra/stats/2020-12-07-prod-stats.json1
-rw-r--r--extra/stats/2020-12-07-prod-tables-sizes.txt46
-rw-r--r--extra/stats/2020-12-23-prod-stats.json1
-rw-r--r--extra/stats/2020-12-23-prod-table-sizes.txt46
-rw-r--r--extra/stats/2020-12-29-prod-stats.json1
-rw-r--r--extra/stats/2020-12-29-prod-table-sizes.txt46
-rw-r--r--notes/bulk_edits/2020-12-14_doaj.md15
-rw-r--r--notes/bulk_edits/2020-12-23_dblp.md55
-rw-r--r--notes/bulk_edits/CHANGELOG.md9
-rw-r--r--python/fatcat_tools/importers/common.py2
-rw-r--r--python/fatcat_tools/importers/dblp_release.py11
-rw-r--r--python/fatcat_tools/importers/ingest.py6
-rw-r--r--python/fatcat_tools/normal.py4
-rw-r--r--rust/Cargo.lock53
-rw-r--r--rust/Cargo.toml4
-rw-r--r--rust/fatcat-openapi/Cargo.toml6
17 files changed, 281 insertions, 56 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4757a85a..6c9728c7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,17 +14,28 @@ See also:
- [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
-## [Unreleased]
-This is a patch release containing many web interface features and tweaks. No
-changes to API.
+## [0.3.3] - 2020-12-24
+
+Minor additions to the API schema: new external identifiers for release
+entities, for `doaj`, `dblp`, and `oai`. Database schema (SQL) not changed.
### Added
+- three new release external identifiers: `doaj`, `dblp`, and `oai` (all
+ article-level). These are API changes, but backwards compatible.
+- DOAJ release import
+- dblp container and release import
- free-form "coverage search" page, allowing visualization of coverage based on
- elasticsearch query
-- editing of all entity types using TOML markup
+ elasticsearch query (web interface)
+- editing of all entity types using TOML markup (via web interface)
- basic sitemap XML generation
+- initial integration of fuzzycat library to prevent duplicate release entity
+ creation at import time
+- import of HTML webcaptures from sandcrawler ingest
+- kafka publishing of updated work entities (transitively of release updates as
+ part of the work), to enable work-level entity update pipeline for archive
+ scholar index
### Changed
@@ -39,16 +50,6 @@ changes to API.
- several other bugfixes to web interface and importer code, not reported here
granularly
-## [0.3.3] - 2020-11-16
-
-Minor additions to the API schema: new external identifiers for release
-entities, for `doaj`, `dblp`, and `oai`. Database schema (SQL) not changed.
-
-### Added
-
-- two new release external identifiers: `doaj`, `dblp`, and `oai` (all
- article-level). These are API changes, but backwards compatible.
-
## [0.3.2] - 2020-04-08
This release was tagged retro-actively; it was the last commit before upgrading
diff --git a/extra/stats/2020-12-07-prod-stats.json b/extra/stats/2020-12-07-prod-stats.json
new file mode 100644
index 00000000..0b3a4e25
--- /dev/null
+++ b/extra/stats/2020-12-07-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":5021021,"timestamp":"2020-12-07T20:18:23.820391+00:00"}},"container":{"total":170246},"papers":{"in_kbart":71815549,"in_web":27500452,"in_web_not_kbart":12066730,"is_oa":17640091,"total":113444477},"release":{"refs_total":1014954200,"total":156561297}}
diff --git a/extra/stats/2020-12-07-prod-tables-sizes.txt b/extra/stats/2020-12-07-prod-tables-sizes.txt
new file mode 100644
index 00000000..618232e3
--- /dev/null
+++ b/extra/stats/2020-12-07-prod-tables-sizes.txt
@@ -0,0 +1,46 @@
+Size: 676.35G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 65 GB | 52 GB | 117 GB
+ "public"."release_rev" | 66 GB | 35 GB | 101 GB
+ "public"."refs_blob" | 93 GB | 2885 MB | 96 GB
+ "public"."file_rev" | 31 GB | 38 GB | 69 GB
+ "public"."release_edit" | 15 GB | 22 GB | 37 GB
+ "public"."work_edit" | 15 GB | 21 GB | 36 GB
+ "public"."file_edit" | 15 GB | 20 GB | 34 GB
+ "public"."file_rev_url" | 23 GB | 8437 MB | 32 GB
+ "public"."release_ident" | 10 GB | 15 GB | 25 GB
+ "public"."work_ident" | 10176 MB | 15 GB | 25 GB
+ "public"."abstracts" | 22 GB | 1832 MB | 23 GB
+ "public"."file_rev_release" | 7761 MB | 13 GB | 21 GB
+ "public"."file_ident" | 7159 MB | 13 GB | 20 GB
+ "public"."release_ref" | 5212 MB | 7393 MB | 12 GB
+ "public"."work_rev" | 6618 MB | 5825 MB | 12 GB
+ "public"."release_rev_abstract" | 3150 MB | 3988 MB | 7139 MB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1145 MB | 647 MB | 1792 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."changelog" | 352 MB | 302 MB | 654 MB
+ "public"."release_rev_extid" | 221 MB | 339 MB | 561 MB
+ "public"."container_rev" | 164 MB | 41 MB | 205 MB
+ "public"."container_edit" | 46 MB | 56 MB | 102 MB
+ "public"."container_ident" | 12 MB | 25 MB | 36 MB
+ "public"."auth_oidc" | 48 kB | 48 kB | 96 kB
+ "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."editor" | 48 kB | 48 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/extra/stats/2020-12-23-prod-stats.json b/extra/stats/2020-12-23-prod-stats.json
new file mode 100644
index 00000000..bb27d708
--- /dev/null
+++ b/extra/stats/2020-12-23-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":5073319,"timestamp":"2020-12-23T19:01:04.942860+00:00"}},"container":{"total":171139},"papers":{"in_kbart":72019264,"in_web":27596752,"in_web_not_kbart":12138388,"is_oa":18312974,"total":114347662},"release":{"refs_total":1020596562,"total":158051479}}
diff --git a/extra/stats/2020-12-23-prod-table-sizes.txt b/extra/stats/2020-12-23-prod-table-sizes.txt
new file mode 100644
index 00000000..dacdb48c
--- /dev/null
+++ b/extra/stats/2020-12-23-prod-table-sizes.txt
@@ -0,0 +1,46 @@
+Size: 684.08G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 66 GB | 52 GB | 119 GB
+ "public"."release_rev" | 67 GB | 36 GB | 103 GB
+ "public"."refs_blob" | 94 GB | 2885 MB | 97 GB
+ "public"."file_rev" | 31 GB | 38 GB | 69 GB
+ "public"."release_edit" | 16 GB | 22 GB | 37 GB
+ "public"."work_edit" | 15 GB | 21 GB | 36 GB
+ "public"."file_edit" | 15 GB | 20 GB | 34 GB
+ "public"."file_rev_url" | 23 GB | 8456 MB | 32 GB
+ "public"."release_ident" | 10 GB | 15 GB | 26 GB
+ "public"."work_ident" | 10 GB | 15 GB | 25 GB
+ "public"."abstracts" | 23 GB | 1869 MB | 25 GB
+ "public"."file_rev_release" | 7769 MB | 13 GB | 21 GB
+ "public"."file_ident" | 7159 MB | 13 GB | 20 GB
+ "public"."release_ref" | 5277 MB | 7481 MB | 12 GB
+ "public"."work_rev" | 6724 MB | 5825 MB | 12 GB
+ "public"."release_rev_abstract" | 3306 MB | 4222 MB | 7529 MB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1162 MB | 657 MB | 1820 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."release_rev_extid" | 314 MB | 469 MB | 783 MB
+ "public"."changelog" | 360 MB | 309 MB | 669 MB
+ "public"."container_rev" | 164 MB | 42 MB | 206 MB
+ "public"."container_edit" | 46 MB | 56 MB | 103 MB
+ "public"."container_ident" | 12 MB | 25 MB | 36 MB
+ "public"."webcapture_rev_cdx" | 2616 kB | 568 kB | 3184 kB
+ "public"."webcapture_edit" | 240 kB | 192 kB | 432 kB
+ "public"."webcapture_rev_url" | 192 kB | 96 kB | 288 kB
+ "public"."webcapture_rev_release" | 80 kB | 136 kB | 216 kB
+ "public"."webcapture_ident" | 88 kB | 112 kB | 200 kB
+ "public"."webcapture_rev" | 144 kB | 56 kB | 200 kB
+ "public"."auth_oidc" | 48 kB | 48 kB | 96 kB
+ "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB
+ "public"."editor" | 48 kB | 48 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/extra/stats/2020-12-29-prod-stats.json b/extra/stats/2020-12-29-prod-stats.json
new file mode 100644
index 00000000..9dc898b9
--- /dev/null
+++ b/extra/stats/2020-12-29-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":5136808,"timestamp":"2020-12-30T03:21:36.540949+00:00"}},"container":{"total":176451},"papers":{"in_kbart":72218853,"in_web":28188176,"in_web_not_kbart":12637639,"is_oa":18905156,"total":115502451},"release":{"refs_total":1023233385,"total":159642952}}
diff --git a/extra/stats/2020-12-29-prod-table-sizes.txt b/extra/stats/2020-12-29-prod-table-sizes.txt
new file mode 100644
index 00000000..6d942f18
--- /dev/null
+++ b/extra/stats/2020-12-29-prod-table-sizes.txt
@@ -0,0 +1,46 @@
+Size: 691.30G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 67 GB | 53 GB | 120 GB
+ "public"."release_rev" | 68 GB | 36 GB | 104 GB
+ "public"."refs_blob" | 94 GB | 2885 MB | 97 GB
+ "public"."file_rev" | 31 GB | 38 GB | 69 GB
+ "public"."release_edit" | 16 GB | 22 GB | 38 GB
+ "public"."work_edit" | 15 GB | 21 GB | 36 GB
+ "public"."file_edit" | 15 GB | 20 GB | 35 GB
+ "public"."file_rev_url" | 24 GB | 8550 MB | 32 GB
+ "public"."release_ident" | 10 GB | 15 GB | 26 GB
+ "public"."work_ident" | 10 GB | 15 GB | 25 GB
+ "public"."abstracts" | 23 GB | 1871 MB | 25 GB
+ "public"."file_rev_release" | 7812 MB | 13 GB | 21 GB
+ "public"."file_ident" | 7159 MB | 13 GB | 20 GB
+ "public"."release_ref" | 5278 MB | 7483 MB | 12 GB
+ "public"."work_rev" | 6761 MB | 5825 MB | 12 GB
+ "public"."release_rev_abstract" | 3318 MB | 4240 MB | 7558 MB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1182 MB | 674 MB | 1856 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."webcapture_rev_cdx" | 1465 MB | 279 MB | 1744 MB
+ "public"."release_rev_extid" | 478 MB | 713 MB | 1191 MB
+ "public"."changelog" | 369 MB | 317 MB | 686 MB
+ "public"."container_rev" | 165 MB | 42 MB | 207 MB
+ "public"."container_edit" | 47 MB | 58 MB | 104 MB
+ "public"."webcapture_edit" | 24 MB | 16 MB | 40 MB
+ "public"."container_ident" | 12 MB | 25 MB | 37 MB
+ "public"."webcapture_rev_url" | 20 MB | 6744 kB | 26 MB
+ "public"."webcapture_rev_release" | 7072 kB | 11 MB | 18 MB
+ "public"."webcapture_rev" | 14 MB | 4352 kB | 18 MB
+ "public"."webcapture_ident" | 7928 kB | 8768 kB | 16 MB
+ "public"."auth_oidc" | 48 kB | 48 kB | 96 kB
+ "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB
+ "public"."editor" | 48 kB | 48 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/notes/bulk_edits/2020-12-14_doaj.md b/notes/bulk_edits/2020-12-14_doaj.md
index 64a80fda..5e897183 100644
--- a/notes/bulk_edits/2020-12-14_doaj.md
+++ b/notes/bulk_edits/2020-12-14_doaj.md
@@ -122,3 +122,18 @@ ahead with the full import; note that other ingest is happening in parallel
zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
# started 2020-12-17 22:01 (Pacific)
+
+ => 5.45M 52:38:45 [28.8 /s]
+ => Counter({'total': 1366458, 'exists': 1020295, 'insert': 200249, 'exists-fuzzy': 144334, 'skip': 1563, 'skip-title': 1563, 'skip-doaj-id-mismatch': 17, 'update': 0})
+
+As total estimates:
+
+- total: 5,465,832
+- exists: 4,081,180
+- exists-fuzzy: 577,336
+- insert: 800,996
+
+Ending database size: Size: 684.08G
+
+(note that regular imports were running during same period)
+
diff --git a/notes/bulk_edits/2020-12-23_dblp.md b/notes/bulk_edits/2020-12-23_dblp.md
new file mode 100644
index 00000000..c3ad0587
--- /dev/null
+++ b/notes/bulk_edits/2020-12-23_dblp.md
@@ -0,0 +1,55 @@
+
+## Prod Container Import
+
+Using 2020-11-30 XML dump, then scrape and transform tooling from
+`extra/dblp/`.
+
+ wget https://archive.org/download/dblp-xml-2020-11-30/dblp_container_meta.json
+
+ # updated ISSN-to-ISSN-L.txt symlink to 20201207.ISSN-to-ISSN-L.txt
+
+ touch /srv/fatcat/datasets/blank_dblp_containers.tsv
+
+Create new `dblp-bot` user:
+
+ ./target/release/fatcat-auth create-editor --admin --bot dblp-bot
+ => gwbheb5jfngrxkcad5qgth5cra
+
+ ./target/release/fatcat-auth create-token gwbheb5jfngrxkcad5qgth5cra
+
+Run import:
+
+ # git commit: ec6b366af8df1956e1287cba2e0818b80ce1c518
+
+ export FATCAT_AUTH_WORKER_DBLP=...
+
+ ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file /srv/fatcat/datasets/blank_dblp_containers.tsv --dblp-container-map-output /srv/fatcat/datasets/all_dblp_containers.tsv /srv/fatcat/datasets/dblp_container_meta.json
+ => Got 0 existing dblp container mappings.
+ => Counter({'total': 6954, 'insert': 5202, 'exists': 1752, 'skip': 0, 'update': 0})
+
+ wc -l /srv/fatcat/datasets/all_dblp_containers.tsv
+ 6955 /srv/fatcat/datasets/all_dblp_containers.tsv
+
+## Prod Release Import
+
+Using same 2020-11-30 XML dump. Download to /srv/fatcat/datasets:
+
+ wget https://archive.org/download/dblp-xml-2020-11-30/dblp.dtd
+ wget https://archive.org/download/dblp-xml-2020-11-30/dblp.xml
+
+Run import:
+
+ export FATCAT_AUTH_WORKER_DBLP=...
+
+ ./fatcat_import.py dblp-release --dblp-container-map-file /srv/fatcat/datasets/all_dblp_containers.tsv /srv/fatcat/datasets/dblp.xml --do-updates
+
+ # started 2020-12-23 11:51 (Pacific)
+
+ # restarted/tweaked at least twice
+
+ # finally ended around 2020-12-27 after about... 48 hours?
+
+ => Counter({'total': 7953365, 'has-doi': 4277307, 'skip': 3097418, 'skip-key-type': 2640968, 'skip-update': 2480449, 'exists': 943800, 'update': 889700, 'insert': 338842, 'skip-arxiv-corr': 312872, 'exists-fuzzy': 203103, 'skip-dblp-container-missing': 143578, 'skip-arxiv': 53, 'skip-title': 1})
+
+Starting database size (roughly): Size: 684.08G
+Ending database size: Size: 690.22G
diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md
index 5f25d769..c5f133f8 100644
--- a/notes/bulk_edits/CHANGELOG.md
+++ b/notes/bulk_edits/CHANGELOG.md
@@ -13,7 +13,14 @@ This file should not turn in to a TODO list!
Updated ORCIDs from 2020 dump. About 2.4 million new `creator` entities.
-Imported DOAJ article metadata from a 2020-11 dump.
+Imported DOAJ article metadata from a 2020-11 dump. Crawled and imported
+several hundred thousand file entities matched by DOAJ identifier. Updated
+journal metadata using chocula tool (before the release ingest). Filtered out
+fuzzy-matching papers before importing.
+
+Imported dblp from a 2020 snapshot, both containers (primarily for conferences
+lacking an ISSN) and release entities (primarily conference papers). Filtered
+out fuzzy-matching papers before importing.
## 2020-03
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2446cdbf..fcbe9ad2 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -164,7 +164,7 @@ class EntityImporter:
self.es_client = kwargs.get('es_client')
if not self.es_client:
- self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
+ self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120)
self._issnl_id_map = dict()
self._orcid_id_map = dict()
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5cbc95d0..daecd765 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -323,6 +323,15 @@ class DblpReleaseImporter(EntityImporter):
if err.status != 404:
raise err
+ # Just skip all releases with an arxiv_id for now. Have not decided
+ # what to do about grouping works and lookup of un-versioned arxiv_id
+ # yet. Note that this means we will lack coverage of some works which
+ # have an arxiv preprint, but in those cases we will presumably at
+ # least have the pre-print copy/record.
+ if re.ext_ids.arxiv:
+ self.counts["skip-arxiv"] += 1
+ return False
+
# then try other ext_id lookups
if not existing:
for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
@@ -361,7 +370,7 @@ class DblpReleaseImporter(EntityImporter):
return False
# logic for whether to do update or skip
- if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv_id:
+ if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
self.counts['skip-update'] += 1
return False
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index cd3d53f6..04ff8db6 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -86,7 +86,7 @@ class IngestFileResultImporter(EntityImporter):
self.counts['skip-ingest_request_source'] += 1
return False
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj'):
+ if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'):
self.counts['skip-link-source'] += 1
return False
@@ -437,7 +437,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
if 'revisit_cdx' in row:
terminal_cdx = row['revisit_cdx']
assert terminal_cdx['surt']
- assert terminal_cdx['url'] == terminal['terminal_url']
+ if terminal_cdx['url'] != terminal['terminal_url']:
+ self.counts['skip-terminal-url-mismatch'] += 1
+ return None
wc_cdx = []
# primary resource first
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index d792979d..4218856c 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -94,6 +94,9 @@ def clean_arxiv_id(raw):
- 'arxiv:' prefix
Works with versioned or un-versioned arxiv identifiers.
+
+ TODO: version of this function that only works with versioned identifiers?
+ That is the behavior of fatcat API
"""
if not raw:
return None
@@ -116,6 +119,7 @@ def test_clean_arxiv_id():
assert clean_arxiv_id("math.CA/0611800v2") == "math.CA/0611800v2"
assert clean_arxiv_id("math.CA/0611800") == "math.CA/0611800"
assert clean_arxiv_id("0806.2878v1 ") == "0806.2878v1"
+ assert clean_arxiv_id("cs/0207047") == "cs/0207047"
assert clean_arxiv_id("https://arxiv.org/abs/0806.2878v1") == "0806.2878v1"
assert clean_arxiv_id("arxiv:0806.2878v1") == "0806.2878v1"
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index fc260a6f..29aaee6b 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -274,7 +274,7 @@ dependencies = [
"cookie",
"failure",
"idna 0.1.5",
- "log 0.4.6",
+ "log 0.4.11",
"publicsuffix",
"serde 1.0.117",
"serde_json 1.0.38",
@@ -542,13 +542,13 @@ dependencies = [
[[package]]
name = "env_logger"
-version = "0.6.0"
+version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "afb070faf94c85d17d50ca44f6ad076bce18ae92f0037d350947240a36e9d42e"
+checksum = "f26ecb66b4bdca6c1409b40fb255eefc2bd4f6d135dab3c3124f80ffa2a9661e"
dependencies = [
"atty",
"humantime",
- "log 0.4.6",
+ "log 0.4.11",
"regex 1.4.2",
"termcolor",
]
@@ -624,7 +624,7 @@ dependencies = [
"iron-slog",
"iron-test",
"lazy_static 1.4.0",
- "log 0.4.6",
+ "log 0.3.9",
"macaroon",
"num_cpus",
"rand 0.6.5",
@@ -655,8 +655,8 @@ dependencies = [
"futures",
"hyper 0.10.15",
"iron 0.6.0",
- "lazy_static 0.2.11",
- "log 0.3.9",
+ "lazy_static 1.4.0",
+ "log 0.4.11",
"multipart",
"router",
"serde 1.0.117",
@@ -753,7 +753,7 @@ dependencies = [
"futures",
"http",
"indexmap",
- "log 0.4.6",
+ "log 0.4.11",
"slab",
"string",
"tokio-io",
@@ -784,12 +784,9 @@ checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
[[package]]
name = "humantime"
-version = "1.2.0"
+version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3ca7e5f2e110db35f93b837c81797f3714500b81d517bf20c431b16d3ca4f114"
-dependencies = [
- "quick-error",
-]
+checksum = "3c1ad908cc71012b7bea4d0c53ba96a8cba9962f048fa68d143376143d863b7a"
[[package]]
name = "hyper"
@@ -824,7 +821,7 @@ dependencies = [
"httparse",
"iovec",
"itoa 0.4.3",
- "log 0.4.6",
+ "log 0.4.11",
"net2",
"time",
"tokio",
@@ -1034,14 +1031,14 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
dependencies = [
- "log 0.4.6",
+ "log 0.4.11",
]
[[package]]
name = "log"
-version = "0.4.6"
+version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6"
+checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b"
dependencies = [
"cfg-if 0.1.6",
]
@@ -1168,7 +1165,7 @@ dependencies = [
"kernel32-sys",
"lazycell",
"libc",
- "log 0.4.6",
+ "log 0.4.11",
"miow",
"net2",
"slab",
@@ -1462,12 +1459,6 @@ dependencies = [
]
[[package]]
-name = "quick-error"
-version = "1.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0"
-
-[[package]]
name = "quote"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1498,7 +1489,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d746fc8a0dab19ccea7ff73ad535854e90ddb3b4b8cdce953dd5cd0b2e7bd22"
dependencies = [
"antidote",
- "log 0.4.6",
+ "log 0.4.11",
"scheduled-thread-pool",
]
@@ -1732,7 +1723,7 @@ dependencies = [
"futures",
"http",
"hyper 0.12.23",
- "log 0.4.6",
+ "log 0.4.11",
"mime 0.3.13",
"mime_guess 2.0.3",
"serde 1.0.117",
@@ -1846,7 +1837,7 @@ dependencies = [
"httpdate",
"im",
"lazy_static 1.4.0",
- "log 0.4.6",
+ "log 0.4.11",
"rand 0.6.5",
"regex 1.4.2",
"reqwest",
@@ -2298,7 +2289,7 @@ checksum = "b53aeb9d3f5ccf2ebb29e19788f96987fa1355f8fe45ea193928eaaaf3ae820f"
dependencies = [
"bytes",
"futures",
- "log 0.4.6",
+ "log 0.4.11",
]
[[package]]
@@ -2310,7 +2301,7 @@ dependencies = [
"crossbeam-utils 0.6.5",
"futures",
"lazy_static 1.4.0",
- "log 0.4.6",
+ "log 0.4.11",
"mio",
"num_cpus",
"parking_lot 0.7.1",
@@ -2344,7 +2335,7 @@ dependencies = [
"crossbeam-deque",
"crossbeam-utils 0.6.5",
"futures",
- "log 0.4.6",
+ "log 0.4.11",
"num_cpus",
"rand 0.6.5",
"slab",
@@ -2636,7 +2627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "797464475f30ddb8830cc529aaaae648d581f99e2036a928877dfde027ddf6b3"
dependencies = [
"futures",
- "log 0.4.6",
+ "log 0.4.11",
"try-lock",
]
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 3eda4df9..270ab09a 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -21,7 +21,7 @@ uuid = "0.6"
log = { version = "*", features = ["max_level_info", "release_max_level_info"] }
data-encoding = "2.1"
regex = "1"
-lazy_static = "1.0"
+lazy_static = "1"
sha1 = { version = "0.6", features = ["std"] }
macaroon = { git = "https://github.com/bnewbold/libmacaroon-rs", branch = "bnewbold-broken" }
rand = "*"
@@ -50,4 +50,4 @@ cadence = "^0.16"
# Command-line tools
crossbeam-channel = "0.2"
num_cpus = "1"
-env_logger = "*"
+env_logger = "0.8"
diff --git a/rust/fatcat-openapi/Cargo.toml b/rust/fatcat-openapi/Cargo.toml
index 3f25b4c3..20256315 100644
--- a/rust/fatcat-openapi/Cargo.toml
+++ b/rust/fatcat-openapi/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fatcat-openapi"
-version = "0.3.1"
+version = "0.3.3"
edition = "2018"
authors = ["Bryan Newbold <bnewbold@archive.org>"]
description = "Fatcat is an editable bibliographic database. This OpenAPI code-generated crate container HTTP API models, endpoints, and other auto-generated types useful for both client and server implementations of the catalog API."
@@ -30,8 +30,8 @@ swagger = "0.7"
#
bodyparser = {version = "0.8", optional = true}
url = "1.5"
-lazy_static = "0.2"
-log = "0.3.0"
+lazy_static = "1"
+log = "0.4"
multipart = {version = "0.13", optional = true}
router = {version = "0.6", optional = true}
serde = "1.0"