summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitlab-ci.yml10
-rw-r--r--CHANGELOG.md38
-rw-r--r--extra/docker/Dockerfile.test-base47
-rw-r--r--extra/docker/README.md4
-rw-r--r--extra/stats/2020-02-21-prod-stats.json1
-rw-r--r--extra/stats/2020-02-21-prod-tables-sizes.txt48
-rw-r--r--extra/stats/2020-02-24-prod-dupes.txt5
-rw-r--r--extra/stats/2020-02-24-prod-table-sizes.txt47
-rw-r--r--extra/stats/2020-03-03-prod-stats.json1
-rw-r--r--extra/stats/2020-04-17-prod-stats.json1
-rw-r--r--extra/stats/2020-04-17-prod-table-sizes.txt46
-rw-r--r--notes/bulk_edits/CHANGELOG.md9
-rw-r--r--python/fatcat_tools/workers/changelog.py29
13 files changed, 267 insertions, 19 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 69b28ecd..f0d78a9a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -14,7 +14,7 @@ variables:
PIPENV_VENV_IN_PROJECT: "true"
CARGO_HOME: ".cargo_cache"
-image: "rust:1.42-buster"
+image: "bnewbold/fatcat-test-base:latest"
unified_test:
variables:
@@ -28,9 +28,9 @@ unified_test:
before_script:
- export PATH="$PATH:$CARGO_HOME/bin"
- apt update -qy
- - apt install -y python3-dev python3-pip python3-wheel python3-requests python3-six python3-pytest libsnappy-dev libsodium-dev software-properties-common python3.7 python3.7-dev python3.7-venv python3.7-distutils libpq-dev
- - cargo install diesel_cli --version 1.3.1
- - pip3 install pipenv
+ - apt install -y python3-dev python3-pip python3-wheel python3-requests python3-six python3-pytest libsnappy-dev libsodium-dev software-properties-common python3.7 python3.7-dev python3.7-venv python3.7-distutils libpq-dev pkg-config python3-pytest git
+ - diesel --version || cargo install diesel_cli --version 1.3.1 --no-default-features --features postgres
+ - pipenv --version || python3.7 -m pip install pipenv
- pipenv --version
script:
- rustc --version && cargo --version && diesel --version
@@ -43,7 +43,7 @@ unified_test:
- cargo test -- --test-threads 1
- cargo run --bin fatcatd &
- cd ../python_openapi_client
- - pytest-3
+ # TODO: - pytest-3
- cd ../python
- cp example.env .env
- pipenv install --dev --deploy
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1a669e5f..a3562271 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,15 +16,49 @@ See also:
## [Unreleased]
-## Changed
+### Changed
+
+- require Python 3.7 (upgrade from Python 3.5)
+
+## [0.3.2] - 2020-04-08
+
+This release was tagged retroactively; it was the last commit before upgrading
+to Python 3.7.
+
+Many small changes and tweaks to importers, web interface, etc. were made in
+this release.
+
+### Fixed
+- pubmed importer `text` vs. `get_text()` for HTML tags
+
+### Changed
+
+- minimum rust version now 1.36
- Switch from swagger-codegen to openapi-generator for python client generation
+- switch python Kafka code from pykafka to confluent-kafka
+- update release and container elasticsearch schemas to v03b. Release search is
+ now over "biblio" field, allowing matches on multiple fields at the same time
+- Crossref harvester using 'update-date' not 'index-date' to detect updated documents
+
+### Removed
+
+- OpenSSL support removed from fatcatd (Rust)
-## Added
+### Added
- webface endpoints for entity view URLs with an underscore instead of slash,
as a redirect. Eg, `https://fatcat.wiki/release_asdf` =>
`https://fatcat.wiki/release/asdf`. A hack to make copy/paste easier.
+- pagination of search results in web interface
+- sandcrawler daily crawling pipeline, including ingest-file importer and
+ publishing requests to sandcrawler kafka topic
+- "Save Paper Now" feature (using sandcrawler pipeline)
+- Datacite DOI registrar daily harvesting and importing
+- Arxiv daily harvesting, using OAI-PMH worker
+- Pubmed daily harvesting, using FTP worker
+- "file" entity elasticsearch schema (though pipeline not yet running
+ continuously)
## [0.3.1] - 2019-09-18
diff --git a/extra/docker/Dockerfile.test-base b/extra/docker/Dockerfile.test-base
new file mode 100644
index 00000000..a556ed99
--- /dev/null
+++ b/extra/docker/Dockerfile.test-base
@@ -0,0 +1,47 @@
+
+FROM ubuntu:xenial
+
+ENV RUSTUP_HOME=/usr/local/rustup \
+ CARGO_HOME=/usr/local/cargo \
+ PATH=/usr/local/cargo/bin:$PATH \
+ RUST_VERSION=1.42.0 \
+ LC_ALL=C.UTF-8 \
+ LANG=C.UTF-8
+
+
+# Add deadsnakes repo
+RUN set -eux; \
+ apt update -qy; \
+ apt install -y software-properties-common; \
+ add-apt-repository -y ppa:deadsnakes/ppa;
+
+# APT dependencies
+RUN set -eux; \
+ apt update -qy; \
+ apt install -y python3-dev python3-pip python3-wheel python3-requests python3-six python3-pytest libsnappy-dev libsodium-dev software-properties-common python3.7 python3.7-dev python3.7-venv python3.7-distutils wget libpq-dev pkg-config python3-pytest git
+
+# Rust setup from docker-rust debian Dockerfile
+RUN set -eux; \
+ dpkgArch="$(dpkg --print-architecture)"; \
+ rustArch='x86_64-unknown-linux-gnu'; rustupSha256='ad1f8b5199b3b9e231472ed7aa08d2e5d1d539198a15c5b1e53c746aad81d27b' ; \
+ url="https://static.rust-lang.org/rustup/archive/1.21.1/${rustArch}/rustup-init"; \
+ wget "$url"; \
+ echo "${rustupSha256} *rustup-init" | sha256sum -c -; \
+ chmod +x rustup-init; \
+ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION; \
+ rm rustup-init; \
+ chmod -R a+w $RUSTUP_HOME $CARGO_HOME; \
+ rustup --version; \
+ cargo --version; \
+ rustc --version;
+
+# Compile and install diesel
+RUN set -eux; \
+ cargo install diesel_cli --version 1.3.1 --no-default-features --features postgres; \
+ diesel --version
+
+# Install pipenv
+RUN set -eux; \
+ pip3 install pipenv; \
+ pipenv --version
+
diff --git a/extra/docker/README.md b/extra/docker/README.md
index 4fd78e56..1869b354 100644
--- a/extra/docker/README.md
+++ b/extra/docker/README.md
@@ -17,3 +17,7 @@ TODO:
- postgres
- fatcatd (rust)
- kibana
+
+## Test Base Image
+
+ docker build -t bnewbold/fatcat-test-base -f Dockerfile.test-base .
diff --git a/extra/stats/2020-02-21-prod-stats.json b/extra/stats/2020-02-21-prod-stats.json
new file mode 100644
index 00000000..3ab6471f
--- /dev/null
+++ b/extra/stats/2020-02-21-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":3528195,"timestamp":"2020-02-22T05:23:18.082262+00:00"}},"container":{"total":148396},"papers":{"in_kbart":60529767,"in_web":20374670,"in_web_not_kbart":9598464,"is_oa":11547112,"total":105732384},"release":{"refs_total":890869519,"total":143867045}}
diff --git a/extra/stats/2020-02-21-prod-tables-sizes.txt b/extra/stats/2020-02-21-prod-tables-sizes.txt
new file mode 100644
index 00000000..bc756ba7
--- /dev/null
+++ b/extra/stats/2020-02-21-prod-tables-sizes.txt
@@ -0,0 +1,48 @@
+
+Size: 478.37G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 53 GB | 43 GB | 96 GB
+ "public"."release_rev" | 58 GB | 33 GB | 91 GB
+ "public"."refs_blob" | 85 GB | 2884 MB | 88 GB
+ "public"."release_edit" | 14 GB | 20 GB | 34 GB
+ "public"."work_edit" | 13 GB | 20 GB | 34 GB
+ "public"."release_ident" | 9515 MB | 15 GB | 24 GB
+ "public"."work_ident" | 9313 MB | 15 GB | 24 GB
+ "public"."abstracts" | 16 GB | 1504 MB | 18 GB
+ "public"."file_rev_url" | 10235 MB | 3587 MB | 13 GB
+ "public"."work_rev" | 6046 MB | 5825 MB | 12 GB
+ "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB
+ "public"."file_rev" | 3635 MB | 5359 MB | 8994 MB
+ "public"."file_edit" | 3111 MB | 4051 MB | 7162 MB
+ "public"."release_rev_abstract" | 2406 MB | 3342 MB | 5749 MB
+ "public"."file_ident" | 1848 MB | 2505 MB | 4354 MB
+ "public"."file_rev_release" | 1698 MB | 2483 MB | 4181 MB
+ "public"."creator_edit" | 702 MB | 942 MB | 1643 MB
+ "public"."creator_rev" | 695 MB | 719 MB | 1413 MB
+ "public"."editgroup" | 767 MB | 405 MB | 1172 MB
+ "public"."creator_ident" | 474 MB | 648 MB | 1121 MB
+ "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB
+ "public"."changelog" | 220 MB | 214 MB | 434 MB
+ "public"."container_rev" | 75 MB | 23 MB | 98 MB
+ "public"."container_edit" | 25 MB | 31 MB | 56 MB
+ "public"."container_ident" | 11 MB | 19 MB | 30 MB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."auth_oidc" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editor" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
+
diff --git a/extra/stats/2020-02-24-prod-dupes.txt b/extra/stats/2020-02-24-prod-dupes.txt
new file mode 100644
index 00000000..7d1d09cf
--- /dev/null
+++ b/extra/stats/2020-02-24-prod-dupes.txt
@@ -0,0 +1,5 @@
+ 19409 doi_ident.dupes.tsv
+ 28530 pmcid_ident.dupes.tsv
+ 463523 pmid_ident.dupes.tsv
+ 2025 sha1_ident.dupes.tsv
+ 10 wikidata_ident.dupes.tsv
diff --git a/extra/stats/2020-02-24-prod-table-sizes.txt b/extra/stats/2020-02-24-prod-table-sizes.txt
new file mode 100644
index 00000000..359cb2f3
--- /dev/null
+++ b/extra/stats/2020-02-24-prod-table-sizes.txt
@@ -0,0 +1,47 @@
+
+Size: 560.76G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 53 GB | 43 GB | 96 GB
+ "public"."release_rev" | 58 GB | 33 GB | 91 GB
+ "public"."refs_blob" | 85 GB | 2884 MB | 88 GB
+ "public"."file_rev" | 23 GB | 26 GB | 49 GB
+ "public"."release_edit" | 14 GB | 20 GB | 34 GB
+ "public"."work_edit" | 13 GB | 20 GB | 34 GB
+ "public"."release_ident" | 9517 MB | 15 GB | 24 GB
+ "public"."work_ident" | 9315 MB | 15 GB | 24 GB
+ "public"."file_edit" | 9555 MB | 14 GB | 24 GB
+ "public"."abstracts" | 16 GB | 1505 MB | 18 GB
+ "public"."file_rev_url" | 13 GB | 4730 MB | 17 GB
+ "public"."file_ident" | 5885 MB | 9480 MB | 15 GB
+ "public"."file_rev_release" | 5515 MB | 9536 MB | 15 GB
+ "public"."work_rev" | 6047 MB | 5825 MB | 12 GB
+ "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB
+ "public"."release_rev_abstract" | 2408 MB | 3343 MB | 5751 MB
+ "public"."creator_edit" | 702 MB | 942 MB | 1643 MB
+ "public"."creator_rev" | 695 MB | 719 MB | 1413 MB
+ "public"."editgroup" | 903 MB | 465 MB | 1368 MB
+ "public"."creator_ident" | 474 MB | 648 MB | 1121 MB
+ "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB
+ "public"."changelog" | 261 MB | 229 MB | 490 MB
+ "public"."container_rev" | 75 MB | 23 MB | 98 MB
+ "public"."container_edit" | 25 MB | 31 MB | 56 MB
+ "public"."container_ident" | 11 MB | 19 MB | 30 MB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."auth_oidc" | 16 kB | 48 kB | 64 kB
+ "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editor" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/extra/stats/2020-03-03-prod-stats.json b/extra/stats/2020-03-03-prod-stats.json
new file mode 100644
index 00000000..0ac977b8
--- /dev/null
+++ b/extra/stats/2020-03-03-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":4242658,"timestamp":"2020-03-03T18:35:06.153130+00:00"}},"container":{"total":148428},"papers":{"in_kbart":60594053,"in_web":22232097,"in_web_not_kbart":10756782,"is_oa":15267353,"total":105933568},"release":{"refs_total":893136234,"total":144138471}}
diff --git a/extra/stats/2020-04-17-prod-stats.json b/extra/stats/2020-04-17-prod-stats.json
new file mode 100644
index 00000000..ddf7fca1
--- /dev/null
+++ b/extra/stats/2020-04-17-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":4460684,"timestamp":"2020-04-17T18:03:34.373631+00:00"}},"container":{"total":149527},"papers":{"in_kbart":60679890,"in_web":24250766,"in_web_not_kbart":11970984,"is_oa":15538739,"total":108761510},"release":{"refs_total":914708032,"total":148081134}}
diff --git a/extra/stats/2020-04-17-prod-table-sizes.txt b/extra/stats/2020-04-17-prod-table-sizes.txt
new file mode 100644
index 00000000..79aa3b98
--- /dev/null
+++ b/extra/stats/2020-04-17-prod-table-sizes.txt
@@ -0,0 +1,46 @@
+Size: 591.60G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 55 GB | 45 GB | 100 GB
+ "public"."release_rev" | 60 GB | 34 GB | 94 GB
+ "public"."refs_blob" | 87 GB | 2885 MB | 89 GB
+ "public"."file_rev" | 26 GB | 29 GB | 55 GB
+ "public"."release_edit" | 14 GB | 21 GB | 35 GB
+ "public"."work_edit" | 14 GB | 21 GB | 34 GB
+ "public"."file_edit" | 11 GB | 16 GB | 27 GB
+ "public"."release_ident" | 9821 MB | 15 GB | 24 GB
+ "public"."work_ident" | 9596 MB | 15 GB | 24 GB
+ "public"."file_rev_url" | 15 GB | 6040 MB | 21 GB
+ "public"."abstracts" | 18 GB | 1688 MB | 19 GB
+ "public"."file_ident" | 6694 MB | 10219 MB | 17 GB
+ "public"."file_rev_release" | 6267 MB | 10109 MB | 16 GB
+ "public"."work_rev" | 6233 MB | 5825 MB | 12 GB
+ "public"."release_ref" | 4441 MB | 6322 MB | 11 GB
+ "public"."release_rev_abstract" | 2637 MB | 3505 MB | 6141 MB
+ "public"."creator_edit" | 702 MB | 942 MB | 1643 MB
+ "public"."editgroup" | 980 MB | 502 MB | 1482 MB
+ "public"."creator_rev" | 695 MB | 719 MB | 1413 MB
+ "public"."creator_ident" | 474 MB | 648 MB | 1121 MB
+ "public"."changelog" | 289 MB | 239 MB | 527 MB
+ "public"."release_rev_extid" | 206 MB | 320 MB | 526 MB
+ "public"."container_rev" | 75 MB | 23 MB | 98 MB
+ "public"."container_edit" | 25 MB | 32 MB | 57 MB
+ "public"."container_ident" | 11 MB | 19 MB | 30 MB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."auth_oidc" | 16 kB | 48 kB | 64 kB
+ "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editor" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md
index 172528da..be53d10c 100644
--- a/notes/bulk_edits/CHANGELOG.md
+++ b/notes/bulk_edits/CHANGELOG.md
@@ -9,6 +9,13 @@ this file should probably get merged into the guide at some point.
This file should not turn in to a TODO list!
+## 2020-03
+
+Started harvesting both Arxiv and Pubmed metadata daily and importing to
+fatcat. Did backfill imports for both sources.
+
+JALC DOI register update from 2019 dump.
+
## 2020-01
Imported around 2,500 new containers (journals, by ISSN-L) from chocula
@@ -21,6 +28,8 @@ Imported new release entities from 2020 Pubmed/MEDLINE baseline. This import
included only new Pubmed works cataloged in 2019 (up until December or so).
Only a few hundred thousand new release entities.
+Daily "ingest" (crawling) pipeline running.
+
## 2019-12
Started continuous harvesting Datacite DOI metadata; first date harvested was
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index d1e7c2db..3a49f86e 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -153,28 +153,33 @@ class EntityUpdatesWorker(FatcatWorker):
doi = ingest_request.get('ext_ids', {}).get('doi')
is_document = release.release_type in (
- 'article-journal',
- 'paper-conference',
'article',
- 'report',
+ 'article-journal',
+ 'article-newspaper',
+ 'book',
'chapter',
- 'manuscript',
- 'review',
- 'thesis',
- 'letter',
'editorial',
- 'abstract',
- 'entry',
+ 'interview',
+ 'legal_case',
+ 'legislation',
+ 'letter',
+ 'manuscript',
+ 'paper-conference',
'patent',
- 'post',
+ 'peer_review',
+ 'report',
+ 'retraction',
+ 'review',
'review-book',
+ 'thesis',
)
is_not_pdf = release.release_type in (
+ 'component',
'dataset',
- 'stub',
- 'software',
'figure',
'graphic',
+ 'software',
+ 'stub',
)
# accept list sets a default "crawl it" despite OA metadata for