aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--README.md60
-rw-r--r--TODO2
-rw-r--r--extra/RUNBOOK.md (renamed from RUNBOOK.md)4
-rw-r--r--extra/blobs/README.md (renamed from blobs/README.md)0
-rw-r--r--extra/blobs/minio/README.md (renamed from blobs/minio/README.md)0
-rw-r--r--extra/blobs/minio/minio.conf (renamed from blobs/minio/minio.conf)0
-rw-r--r--extra/blobs/seaweedfs/README.md (renamed from blobs/seaweedfs/README.md)0
-rw-r--r--extra/blobs/tasks.md (renamed from blobs/tasks.md)4
-rw-r--r--extra/hbase/howto.md (renamed from hbase/howto.md)0
-rw-r--r--extra/hbase/notes.txt (renamed from hbase/notes.txt)0
-rw-r--r--extra/hbase/schema_design.md (renamed from hbase/schema_design.md)0
-rw-r--r--extra/nginx/README.md (renamed from nginx/README.md)0
-rw-r--r--extra/nginx/fatcat-blobs (renamed from nginx/fatcat-blobs)0
-rw-r--r--extra/nginx/sandcrawler-db (renamed from nginx/sandcrawler-db)0
-rw-r--r--extra/nginx/sandcrawler-minio (renamed from nginx/sandcrawler-minio)0
-rw-r--r--notes/dryad_datasets.md17
-rw-r--r--notes/examples/2021-11-12_broken_grobid_xml.md83
-rw-r--r--notes/examples/dataset_examples.txt52
-rw-r--r--notes/examples/html_test_journals.txt153
-rw-r--r--notes/examples/random_datasets.md19
-rw-r--r--notes/ingest/2021-09-02_oai_pmh_patch.md4
-rw-r--r--notes/ingest/2021-09-03_patch_crawl.md196
-rw-r--r--notes/ingest/2021-12-13_datasets.md504
-rw-r--r--notes/ingest/2022-01-06_patch_crawl.md398
-rw-r--r--notes/ingest/2022-01-13_doi_crawl.md248
-rw-r--r--notes/ingest/2022-03_doaj.md278
-rw-r--r--notes/ingest/2022-03_oaipmh.md40
-rw-r--r--notes/ingest/2022-04_targeted.md144
-rw-r--r--notes/ingest/2022-04_unpaywall.md278
-rw-r--r--notes/ingest/2022-07-15_ingest_fixes.md831
-rw-r--r--notes/ingest/2022-07-19_dblp.md50
-rw-r--r--notes/ingest/2022-07_doaj.md199
-rw-r--r--notes/ingest/2022-07_targeted.md140
-rw-r--r--notes/ingest/2022-09_oaipmh.md397
-rw-r--r--notes/ingest_domains.txt294
-rw-r--r--notes/possible_ingest_targets.txt15
-rw-r--r--notes/tasks/2021-09-09_pdf_url_lists.md4
-rw-r--r--notes/tasks/2021-12-06_regrobid.md91
-rw-r--r--notes/tasks/2022-01-07_grobid_platform_pdfs.md23
-rw-r--r--notes/tasks/2022-03-07_ukraine_firedrill.md225
-rw-r--r--notes/tasks/2022-04-27_pdf_url_lists.md72
-rw-r--r--notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md132
-rwxr-xr-xplease2
-rw-r--r--proposals/2018_original_sandcrawler_rfc.md (renamed from sandcrawler-rfc.md)0
-rw-r--r--proposals/2019_ingest.md2
-rw-r--r--proposals/20200129_pdf_ingest.md2
-rw-r--r--proposals/20200207_pdftrio.md5
-rw-r--r--proposals/20201012_no_capture.md5
-rw-r--r--proposals/20201103_xml_ingest.md19
-rw-r--r--proposals/2020_pdf_meta_thumbnails.md2
-rw-r--r--proposals/2021-04-22_crossref_db.md2
-rw-r--r--proposals/2021-12-09_trawling.md180
-rw-r--r--proposals/brainstorm/2021-debug_web_interface.md9
-rw-r--r--proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md36
-rw-r--r--python/.flake83
-rw-r--r--python/Pipfile14
-rw-r--r--python/Pipfile.lock2309
-rw-r--r--python/README.md46
-rw-r--r--python/TODO7
-rwxr-xr-xpython/grobid_tool.py2
-rwxr-xr-xpython/ingest_tool.py63
-rwxr-xr-xpython/pdftrio_tool.py2
-rw-r--r--python/pytest.ini2
-rw-r--r--python/sandcrawler/__init__.py1
-rw-r--r--python/sandcrawler/fileset_platforms.py20
-rw-r--r--python/sandcrawler/fileset_strategies.py55
-rw-r--r--python/sandcrawler/grobid.py9
-rw-r--r--python/sandcrawler/html.py51
-rw-r--r--python/sandcrawler/html_metadata.py210
-rw-r--r--python/sandcrawler/ia.py159
-rw-r--r--python/sandcrawler/ingest_file.py48
-rw-r--r--python/sandcrawler/ingest_fileset.py40
-rw-r--r--python/sandcrawler/ingest_html.py18
-rw-r--r--python/sandcrawler/minio.py2
-rw-r--r--python/sandcrawler/misc.py4
-rw-r--r--python/sandcrawler/pdfextract.py20
-rw-r--r--python/sandcrawler/pdftrio.py4
-rw-r--r--python/sandcrawler/persist.py4
-rw-r--r--python/sandcrawler/workers.py4
-rwxr-xr-xpython/sandcrawler_worker.py29
-rwxr-xr-xpython/scripts/deliver_dumpgrobid_to_s3.py10
-rwxr-xr-xpython/scripts/deliver_gwb_to_disk.py8
-rwxr-xr-xpython/scripts/deliver_gwb_to_s3.py10
-rwxr-xr-xpython/scripts/doaj2ingestrequest.py1
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py9
-rwxr-xr-xpython/scripts/oai2ingestrequest.py40
-rwxr-xr-xpython/scripts/unpaywall2ingestrequest.py3
-rw-r--r--python/tests/test_html.py25
-rw-r--r--python/tests/test_ingest.py26
-rw-r--r--python/tests/test_savepagenow.py85
-rw-r--r--python_hadoop/README.md8
-rw-r--r--sql/Makefile35
-rw-r--r--sql/dump_reingest_bulk.sql31
-rw-r--r--sql/dump_reingest_old.sql36
-rw-r--r--sql/dump_reingest_quarterly.sql32
-rw-r--r--sql/dump_reingest_spn.sql25
-rw-r--r--sql/dump_reingest_terminalstatus.sql34
-rw-r--r--sql/dump_reingest_weekly.sql27
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql1
-rwxr-xr-xsql/reingest_bulk.sh19
-rwxr-xr-xsql/reingest_old.sh19
-rwxr-xr-xsql/reingest_quarterly.sh2
-rwxr-xr-xsql/reingest_spn.sh2
-rwxr-xr-xsql/reingest_terminalstatus_forcerecrawl.sh19
-rwxr-xr-xsql/reingest_weekly.sh4
-rw-r--r--sql/stats/2022-04-26_stats.txt432
-rw-r--r--sql/stats/2022-04-27_crawl_changelog.txt191
-rw-r--r--sql/stats/2022-05-11_crawl_changelog.txt410
-rw-r--r--sql/stats/2022-09-06_stats.txt438
-rw-r--r--sql/stats/2022-11-23_table_sizes.txt21
110 files changed, 8933 insertions, 1417 deletions
diff --git a/README.md b/README.md
index a0eaa98..b29e397 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,25 @@
\ooooooo| |___/\__,_|_| |_|\__,_|\___|_| \__,_| \_/\_/ |_|\___|_|
-This repo contains back-end python workers, scripts, hadoop jobs, luigi tasks,
-and other scripts and code for the Internet Archive web group's journal ingest
-pipeline. This code is of mixed quality and is mostly experimental. The goal
-for most of this is to submit metadata to [fatcat](https://fatcat.wiki), which
-is the more stable, maintained, and public-facing service.
-
-Code in this repository is potentially public! Not intended to accept public
-contributions for the most part. Much of this will not work outside the IA
-cluster environment.
+This repo contains back-end python workers, scripts, config files, and other
+stuff related to the Internet Archive web group's scholarly web preservation
+and processing pipeline. It is a complement to [fatcat](https://fatcat.wiki),
+which is an open catalog of research outputs, including preservation metadata.
+
+The sandcrawler part of the project deals with content crawled from the web
+into either web.archive.org or archive.org collections, and post-processing
+that content. For example, extracting text from PDF files, verifying mimetypes,
+and checking archival status. The resulting metadata ends up getting filtered,
+transformed, and pushed in to fatcat itself for public use.
+
+While code in this repository is public, it is mostly IA-specific and may not
+even run outside the IA data centers due to library dependencies and
+authentication needs. Code quality and documentation is generally poor compared
+to fatcat.
+
+As of December 2022, the best document to read for "getting started" in
+understanding the ingest system is `proposals/2019_ingest.md`, and then
+subsequent proposals expanding on that foundation.
Archive-specific deployment/production guides and ansible scripts at:
[journal-infra](https://git.archive.org/webgroup/journal-infra)
@@ -22,33 +32,35 @@ Archive-specific deployment/production guides and ansible scripts at:
## Repository Layout
-**./proposals/** design documentation and change proposals
-
**./python/** contains scripts and utilities for ingesting content from wayback
-and/or the web (via save-page-now API), and other processing pipelines
+and/or the web (via save-page-now API), and other processing pipelines. Most of
+the active code is in here. See included README (`./python/README.md`)
**./sql/** contains schema, queries, and backfill scripts for a Postgres SQL
database index (eg, file metadata, CDX, and GROBID status tables).
-**./pig/** contains a handful of Pig scripts, as well as some unittests
-implemented in python. Only rarely used.
+**./python_hadoop/** contains Hadoop streaming jobs written in python using the
+`mrjob` library. Still use the HBase backfill code path occasionally.
-**./scalding/** contains Hadoop jobs written in Scala using the Scalding
-framework. The intent is to write new non-trivial Hadoop jobs in Scala, which
-brings type safety and compiled performance. Mostly DEPRECATED.
+**./proposals/** design documentation and change proposals
-**./python_hadoop/** contains Hadoop streaming jobs written in python using the
-`mrjob` library. Mostly DEPRECATED.
+**./notes/ingest/** log of bulk crawls and metadata loads
+**./extra/docker/** docker-compose setup that may be useful for documentation
+(includes Kafka, PostgreSQL, etc)
-## Running Python Code
+**./.gitlab-ci.yml** current CI setup script, which documents dependencies
-You need python3.8 (or python3.6+ and `pyenv`) and `pipenv` to set up the
-environment. You may also need the debian packages `libpq-dev` and `
-`python-dev` to install some dependencies.
+**./pig/** contains a handful of Pig scripts, as well as some unittests
+implemented in python. Only rarely used.
+
+**./scalding/** contains Hadoop jobs written in Scala using the Scalding
+framework. The intent is to write new non-trivial Hadoop jobs in Scala, which
+brings type safety and compiled performance. Mostly DEPRECATED, this code has
+not been run in years.
-## Running Hadoop Jobs (DEPRECATED)
+## Running Python Hadoop Jobs
The `./please` python3 wrapper script is a helper for running jobs (python or
scalding) on the IA Hadoop cluster. You'll need to run the setup/dependency
diff --git a/TODO b/TODO
index 77b48c9..33dc147 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,6 @@
+Note: as of 2022 this file is ancient and need review
+
## Kafka Pipelines
- after network split, mass restarting import/harvest stuff seemed to
diff --git a/RUNBOOK.md b/extra/RUNBOOK.md
index 33d4711..6c4165d 100644
--- a/RUNBOOK.md
+++ b/extra/RUNBOOK.md
@@ -23,7 +23,7 @@ Copy/transfer to a Kafka node; load a sample and then the whole output:
Older example; if this fails, need to re-run entire thing:
- cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
TODO: is it possible to use job log with millions of `--pipe` inputs? That
would be more efficient in the event of failure.
@@ -35,7 +35,7 @@ Want to use GNU/Parallel in a mode that will do retries well:
fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
sort | \
parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
- './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
After starting, check that messages are actually getting pushed to kafka
(producer failures can be silent!). If anything goes wrong, run the exact same
diff --git a/blobs/README.md b/extra/blobs/README.md
index 555db92..555db92 100644
--- a/blobs/README.md
+++ b/extra/blobs/README.md
diff --git a/blobs/minio/README.md b/extra/blobs/minio/README.md
index d8f1c69..d8f1c69 100644
--- a/blobs/minio/README.md
+++ b/extra/blobs/minio/README.md
diff --git a/blobs/minio/minio.conf b/extra/blobs/minio/minio.conf
index 2e93f9a..2e93f9a 100644
--- a/blobs/minio/minio.conf
+++ b/extra/blobs/minio/minio.conf
diff --git a/blobs/seaweedfs/README.md b/extra/blobs/seaweedfs/README.md
index d19e9e0..d19e9e0 100644
--- a/blobs/seaweedfs/README.md
+++ b/extra/blobs/seaweedfs/README.md
diff --git a/blobs/tasks.md b/extra/blobs/tasks.md
index 34dec8f..beb765f 100644
--- a/blobs/tasks.md
+++ b/extra/blobs/tasks.md
@@ -19,7 +19,7 @@ didn't try to connect to postgresql.
Commands:
- ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
=> Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed
=> run briefly, then kill
@@ -29,7 +29,7 @@ On kafka-broker worker:
Then run 2x instances of worker (same command as above):
- ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
At this point CPU-limited on this worker by the python processes (only 4 cores
on this machine).
diff --git a/hbase/howto.md b/extra/hbase/howto.md
index 26d33f4..26d33f4 100644
--- a/hbase/howto.md
+++ b/extra/hbase/howto.md
diff --git a/hbase/notes.txt b/extra/hbase/notes.txt
index 20f406f..20f406f 100644
--- a/hbase/notes.txt
+++ b/extra/hbase/notes.txt
diff --git a/hbase/schema_design.md b/extra/hbase/schema_design.md
index 2db8998..2db8998 100644
--- a/hbase/schema_design.md
+++ b/extra/hbase/schema_design.md
diff --git a/nginx/README.md b/extra/nginx/README.md
index 0369f9b..0369f9b 100644
--- a/nginx/README.md
+++ b/extra/nginx/README.md
diff --git a/nginx/fatcat-blobs b/extra/nginx/fatcat-blobs
index 5c692ef..5c692ef 100644
--- a/nginx/fatcat-blobs
+++ b/extra/nginx/fatcat-blobs
diff --git a/nginx/sandcrawler-db b/extra/nginx/sandcrawler-db
index 67d1a2d..67d1a2d 100644
--- a/nginx/sandcrawler-db
+++ b/extra/nginx/sandcrawler-db
diff --git a/nginx/sandcrawler-minio b/extra/nginx/sandcrawler-minio
index 2e9bfe3..2e9bfe3 100644
--- a/nginx/sandcrawler-minio
+++ b/extra/nginx/sandcrawler-minio
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md
new file mode 100644
index 0000000..5c727b1
--- /dev/null
+++ b/notes/dryad_datasets.md
@@ -0,0 +1,17 @@
+
+api docs: https://datadryad.org/api/v2/docs
+
+current search queries return 38,000 hits (December 2020)
+
+exmaple with multiple versions:
+ https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions
+
+
+how to handle versions? DOI doesn't get incremented.
+
+on archive.org, could have separate item for each version, or sub-directories within item, one for each version
+
+in fatcat, could have a release for each version, but only one with
+the DOI; or could have a separate fileset for each version
diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md
new file mode 100644
index 0000000..5223651
--- /dev/null
+++ b/notes/examples/2021-11-12_broken_grobid_xml.md
@@ -0,0 +1,83 @@
+
+Find all the PDFs from web which resulted in `bad-grobid-xml` status code (among others):
+
+ sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100;
+
+ sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata
+ ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------
+ d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"}
+ 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"}
+ 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf
+ FIXED
+ f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"}
+ https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf
+ FIXED
+ c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"}
+ https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf
+ FIXED (and good)
+ 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"}
+ https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf
+ FIXED
+ metadata quality mixed, but complex document (?)
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"}
+ https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23
+ FIXED
+ 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"}
+ https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf
+ FIXED
+ 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"}
+ not found
+ acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"}
+ https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf
+ BROKEN: not well-formed (invalid token): line 60, column 45
+ <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note>
+ 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"}
+ not found
+ c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"}
+ https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308
+ BROKEN: not well-formed (invalid token): line 58, column 45
+ <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, &amp; Bian, 2020).</note>
+ 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 1824, column 45"}
+ not found
+ 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"}
+ not found
+ f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"}
+ https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649
+ FIXED, good
+ f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf
+ FIXED
+ 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"}
+ https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf
+ FIXED
+ 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1
+ FIXED
+ (21 rows)
+
+Some other errors from other queries:
+
+ d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"}
+ https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf
+ FIXED: with 0.7.0+
+
+ 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"}
+ https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf
+ still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500
+ BAD PDF ("no pages" in evince)
+
+ d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"}
+ https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf
+ FIXED
+
+ 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t
+ https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf
+ FIXED
+
+In summary, there are still a small number of `bad-grobid-xml` cases, and still
+many "very large PDF" cases. But we should probably broadly retry everything,
+especially the 503 errors (from when GROBID is simply down/unavailable).
+
+The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations,
+which I have submitted a patch/PR for.
diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt
new file mode 100644
index 0000000..3a04750
--- /dev/null
+++ b/notes/examples/dataset_examples.txt
@@ -0,0 +1,52 @@
+
+### ArchiveOrg: CAT dataset
+
+<https://archive.org/details/CAT_DATASET>
+
+`release_36vy7s5gtba67fmyxlmijpsaui`
+
+###
+
+<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635>
+
+doi:10.1371/journal.pone.0120448
+
+Single .rar file
+
+### Dataverse
+
+<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B>
+
+Single excel file
+
+### Dataverse
+
+<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+doi:10.7910/DVN/CLSFKX
+
+Mulitple files; multiple versions?
+
+API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+ .data.id
+ .data.latestVersion.datasetPersistentId
+ .data.latestVersion.versionNumber, .versionMinorNumber
+ .data.latestVersion.files[]
+ .dataFile
+ .contentType (mimetype)
+ .filename
+ .filesize (int, bytes)
+ .md5
+ .persistendId
+ .description
+ .label (filename?)
+ .version
+
+Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB>
+
+Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3)
+
+Dataverse refs:
+- 'doi' and 'hdl' are the two persistentId styles
+- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled
diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt
new file mode 100644
index 0000000..540dc9f
--- /dev/null
+++ b/notes/examples/html_test_journals.txt
@@ -0,0 +1,153 @@
+
+Good examples of journals to run HTML fulltext extraction on.
+
+## Live Web
+
+d-lib magazine
+ live web
+ no longer active
+ http://www.dlib.org/back.html
+
+NLM technical bulletin
+ https://www.nlm.nih.gov/pubs/techbull/back_issues.html
+
+Genders
+ https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html
+
+firstmondays
+ live web; now OJS
+
+outhistory.org
+
+http://journal.sjdm.org/
+
+http://whoosh.org/
+
+
+## Vanished (but wayback coverage)
+
+ohmylittledata
+ issn:2551-1289
+ vanished
+ blog format
+ http://web.archive.org/web/20180421061156/https://ohmylittledata.com/
+
+exquisit corpse
+ https://web.archive.org/web/20080521052400/http://corpse.org:80/
+
+Journal of Mundane Behavior
+ https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya
+ ISSN: 1529-3041
+
+ defunct since ~2010
+ simple HTML articles
+ references
+ http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm
+ http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm
+
+War Crimes
+
+ PDF articles (not HTML)
+ http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/
+
+
+## DOAJ Test Articles (HTML)
+
+ zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt
+ => 2,184,954
+
+ cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25
+ 254817 link.springer.com
+ 145159 www.scielo.br
+ 78044 journal.frontiersin.org
+ 77394 www.frontiersin.org
+ 40849 www.dovepress.com
+ 19024 dergipark.org.tr
+ 18758 periodicos.ufsc.br
+ 16346 www.revistas.usp.br
+ 15872 revistas.unal.edu.co
+ 15527 revistas.ucm.es
+ 13669 revistas.usal.es
+ 12640 dergipark.gov.tr
+ 12111 journals.rudn.ru
+ 11839 www.scielosp.org
+ 11277 www.karger.com
+ 10827 www.journals.vu.lt
+ 10318
+ 9854 peerj.com
+ 9100 ojs.unud.ac.id
+ 8581 jurnal.ugm.ac.id
+ 8261 riviste.unimi.it
+ 8012 journals.uran.ua
+ 7454 revistas.pucp.edu.pe
+ 7264 journals.vgtu.lt
+ 7200 publicaciones.banrepcultural.org
+
+ cat html_fulltext_urls.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.filtered.txt
+ => 1,579,257
+
+ zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt
+ => 560k
+
+ cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25
+ 40849 www.dovepress.com
+ 10570 journals.rudn.ru
+ 10494 dergipark.org.tr
+ 10233 revistas.unal.edu.co
+ 9981 dergipark.gov.tr
+ 9428 revistas.usal.es
+ 8292 revistas.ucm.es
+ 7200 publicaciones.banrepcultural.org
+ 6953 revistas.pucp.edu.pe
+ 6000 www.scielosp.org
+ 5962 www.scielo.br
+ 5621 www.richtmann.org
+ 5123 scielo.sld.cu
+ 5067 ojs.unud.ac.id
+ 4838 periodicos.ufsc.br
+ 4736 revistasonlinepre.inap.es
+ 4486 journal.fi
+ 4221 www.seer.ufu.br
+ 3553 revistas.uam.es
+ 3492 revistas.pucsp.br
+ 3060 www.scielo.org.co
+ 2991 scielo.isciii.es
+ 2802 seer.ufrgs.br
+ 2692 revistas.unc.edu.ar
+ 2685 srl.si
+
+ cat html_fulltext_urls.no_doi.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.no_doi.filtered.txt
+ => 518,608
+
+ zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20
+ https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795
+ https://journal.umy.ac.id/index.php/st/article/view/3297
+ https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442
+ http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf
+ http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440
+ https://journal.fi/inf/article/view/59430
+ http://journal.uii.ac.id/index.php/Eksakta/article/view/2429
+ https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS
+ https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157
+ http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce
+ http://revistas.pucp.edu.pe/index.php/themis/article/view/11862
+ http://journal.bdfish.org/index.php/fisheries/article/view/91
+ https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567
+ https://www.lithosphere.ru/jour/article/view/779
+ https://journals.hioa.no/index.php/seminar/article/view/2412
+ http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197
+ https://www.kmuj.kmu.edu.pk/article/view/15698
+ http://forodeeducacion.com/ojs/index.php/fde/article/view/82
+ https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941
+ http://grbs.library.duke.edu/article/view/3361
+
diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md
new file mode 100644
index 0000000..b69132c
--- /dev/null
+++ b/notes/examples/random_datasets.md
@@ -0,0 +1,19 @@
+
+Possible external datasets to ingest (which are not entire platforms):
+
+- https://research.google/tools/datasets/
+- https://openslr.org/index.html
+- https://www.kaggle.com/datasets?sort=votes&tasks=true
+- https://archive.ics.uci.edu/ml/datasets.php
+
+Existing archive.org datasets to ingest:
+
+- https://archive.org/details/allthemusicllc-datasets
+
+Papers on archive.org to ingest:
+
+- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=>
+- <https://archive.org/details/biorxiv>
+- <https://archive.org/details/philosophicaltransactions?tab=collection>
+- <https://archive.org/search.php?query=doi%3A%2A>
+- <https://archive.org/details/folkscanomy_academic>
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md
index fded7b3..ac808dd 100644
--- a/notes/ingest/2021-09-02_oai_pmh_patch.md
+++ b/notes/ingest/2021-09-02_oai_pmh_patch.md
@@ -1506,8 +1506,8 @@ possible to detect these at ingest time, or earlier at OAI-PMH
harvest/transform time and filter them out.
It may be worthwhile to attempt ingest of multiple existing captures
-(timestamps) in the ingest pipeline. Eg, isntead of chosing a single "best"
-capture, if therea are multiple HTTP 200 status captures, try ingest with each
+(timestamps) in the ingest pipeline. Eg, instead of chosing a single "best"
+capture, if there are multiple HTTP 200 status captures, try ingest with each
(or at least a couple). This is because repository software gets upgraded, so
old "no-capture" or "not found" or "link loop" type captures may work when
recrawled.
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md
index f63e524..d36f427 100644
--- a/notes/ingest/2021-09-03_patch_crawl.md
+++ b/notes/ingest/2021-09-03_patch_crawl.md
@@ -482,7 +482,197 @@ Note that this is just seedlists, not full ingest requests.
Then run the actual patch crawl!
-## Ingest Requests for Bulk Retry
+## Ingest Requests for Bulk Retry (2022-01-06)
+
+Crawl has just about completed, so running another round of bulk ingest
+requests, slightly updated to allow `https://doi.org/10*` in terminal URL:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.updated <= '2022-01-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json';
+ => 4,488,193
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => TIMEDOUT
+ => (probably due to re-assignment)
+ => DONE
+
+## Stats Again (just OAI-PMH)
+
+OAI-PMH query:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+On 2022-02-08:
+
+ status | count
+ -----------------------+----------
+ success | 13505143
+ no-pdf-link | 8741007
+ no-capture | 4429986
+ redirect-loop | 1566611
+ terminal-bad-status | 816162
+ link-loop | 459006
+ wrong-mimetype | 448983
+ null-body | 71871
+ cdx-error | 19055
+ | 15275
+ petabox-error | 11713
+ blocked-cookie | 11664
+ wayback-error | 8745
+ skip-url-blocklist | 7828
+ max-hops-exceeded | 2031
+ wayback-content-error | 338
+ body-too-large | 280
+ spn2-error:job-failed | 191
+ bad-redirect | 134
+ redirects-exceeded | 120
+ (20 rows)
+
+
+On 2022-02-28, after bulk ingest completed:
+
+ status | count
+ -----------------------+----------
+ success | 14668123
+ no-pdf-link | 8822460
+ no-capture | 2987565
+ redirect-loop | 1629015
+ terminal-bad-status | 917851
+ wrong-mimetype | 466512
+ link-loop | 460941
+ null-body | 71457
+ cdx-error | 19636
+ petabox-error | 16198
+ | 15275
+ blocked-cookie | 11885
+ wayback-error | 8779
+ skip-url-blocklist | 7838
+ empty-blob | 5906
+ max-hops-exceeded | 5563
+ wayback-content-error | 355
+ body-too-large | 329
+ spn2-error:job-failed | 191
+ bad-redirect | 137
+ (20 rows)
+
+
+Comparing to a couple months ago:
+
+ 14668123-13258356 = +1,409,767 success
+ 8822460-8685519 = + 136,941 no-pdf-link
+ 2987565-4765663 = -1,778,098 no-capture
+ 917851-803373 = + 114,478 terminal-bad-status
-TODO: for each of the link sources mentioned at top, do a separate query by
-source to re-ingest.
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
new file mode 100644
index 0000000..786c3b2
--- /dev/null
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -0,0 +1,504 @@
+
+First round of production dataset ingest. Aiming to get one or two small
+repositories entirely covered, and a few thousand datasets from all supported
+platforms.
+
+Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up
+to a TByte of content locally (on spinning disk). For successful output, will
+run through fatcat import; for a subset of unsuccessful, will start a small
+heritrix crawl.
+
+
+## Ingest Generation
+
+Summary:
+
+ wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json
+ 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+ 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+All the below ingest requests were combined into a single large file:
+
+ cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz
+ # 24.7k 0:00:00 [91.9k/s]
+
+### Figshare
+
+- sample 10k datasets (not other types)
+- want only "versioned" DOIs; use regex on DOI to ensure
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \
+ | rg '10\.6084/m9\.figshare\.\d+.v\d+' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000})
+
+### Zenodo
+
+- has DOIs (of course)
+- want only "versioned" DOIs? how to skip?
+- sample 10k
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \
+ | rg '10\.5281/zenodo' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+### Goettingen Research Online
+
+- <https://data.goettingen-research-online.de/>
+- Dataverse instance, not harvard-hosted
+- ~1,400 datasets, ~10,500 files
+- has DOIs
+- `doi_prefix:10.25625`, then filter to only one slash
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \
+ | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \
+ | shuf \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s]
+
+### Harvard Dataverse
+
+- main harvard dataverse instance, many "sub-dataverses"
+- ~137,000 datasets, ~1,400,000 files
+- 10k sample
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \
+ | rg '10\.7910/dvn/[a-z0-9]{6}' \
+ | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s]
+
+Note that this was fewer than expected, but moving on anyways.
+
+### archive.org
+
+A couple hand-filtered items.
+
+"CAT" dataset
+- item: <https://archive.org/details/CAT_DATASET>
+- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui`
+
+"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing"
+- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62
+- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper)
+
+
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/CAT_DATASET",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "36vy7s5gtba67fmyxlmijpsaui",
+ "work_ident": "ycqtbhnfmzamheq2amztiwbsri"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "36vy7s5gtba67fmyxlmijpsaui"
+ }
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu",
+ "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu"
+ }
+
+ # paste and then Ctrl-D:
+ cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+
+
+## Ingest Command
+
+On `wbgrp-svc263`.
+
+In the current version of tool, `skip_cleanup_local_files=True` by default, so
+files will stick around.
+
+Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output.
+
+
+ # first a small sample
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | head -n5 \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json
+
+ # ok, run the whole batch through
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json
+
+Got an error:
+
+ internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`?
+
+Did a hot patch to try to have the uploads happen under a session, with config from ENV, but didn't work:
+
+ AttributeError: 'ArchiveSession' object has no attribute 'upload'
+
+Going to hack with config in homedir for now.
+
+Extract URLs for crawling:
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt
+
+### Exceptions Encountered
+
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process
+ internetarchive.upload
+ [...]
+ ConnectionResetError: [Errno 104] Connection reset by peer
+ urllib3.exceptions.ProtocolError
+ requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5')
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process
+ r.raise_for_status()
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status
+ raise HTTPError(http_error_msg, response=self)
+ requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201
+
+download sometimes just slowly time out, like after a day or more
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path
+ mimetype = magic.Magic(mime=True).from_file(path)
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 111, in from_file
+ with _real_open(filename):
+ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz'
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request
+ obj_latest = obj["data"]["latestVersion"]
+ KeyError: 'latestVersion'
+
+Fixed the above, trying again:
+
+ git log | head -n1
+ # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c
+
+ Wed Dec 15 21:57:42 UTC 2021
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json
+
+Zenodo seems really slow, let's try filtering those out:
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json
+ # 3.76k 15:12:53 [68.7m/s]
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json
+
+## Fatcat Import
+
+ wc -l ingest_dataset_combined_results*.json
+ 126 ingest_dataset_combined_results2.json
+ 153 ingest_dataset_combined_results3.json
+ 275 ingest_dataset_combined_results4.json
+ 3762 ingest_dataset_combined_results5.json
+ 7736 ingest_dataset_combined_results6.json
+ 182 ingest_dataset_combined_results.json
+ 5 ingest_dataset_combined_results.ramp.json
+ 12239 total
+
+ cat ingest_dataset_combined_results*.json \
+ | rg '^\{' \
+ | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \
+ | sort \
+ | uniq --check-chars 26 \
+ | cut -f2 \
+ | rg -v '\\\\' \
+ | pv -l \
+ > uniq_ingest_dataset_combined_results.json
+ # 9.48k 0:00:06 [1.54k/s]
+
+ cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr
+ 7941 no-capture
+ 374 platform-404
+ 369 terminal-bad-status
+ 348 success-file
+ 172 success
+ 79 platform-scope
+ 77 error-platform-download
+ 47 empty-manifest
+ 27 platform-restricted
+ 20 too-many-files
+ 12 redirect-loop
+ 6 error-archiveorg-upload
+ 3 too-large-size
+ 3 mismatch
+ 1 no-platform-match
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success") | .' -c \
+ > uniq_ingest_dataset_combined_results.success.json
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success-file") | .' -c \
+ > uniq_ingest_dataset_combined_results.success-file.json
+
+On fatcat QA instance:
+
+ git log | head -n1
+ # commit cca680e2cc4768a4d45e199f6256a433b25b4075
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+Need to update fatcat file worker to support single-file filesets... was that the plan?
+
+ head /tmp/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+
+Trying again 2022-03-23:
+
+ git log | head -n1
+ # commit 134cb050988be2c545af89e0a67c4998307bb819
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0})
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0})
+
+Fixed a small logic error in insert path.
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0})
+
+archive.org datasets are *not* getting uploaded with the correct path. path
+directory prefixes are getting clobbered.
+
+## Summary
+
+As follow-up, it may be worth doing another manual round of ingest requests.
+After that, would be good to fill in "glue" code so that this can be done with
+kafka workers, and do re-tries/dumps using sandcrawler SQL database. Then can
+start scaling up more ingest, using ingest tool, "bulk mode" processing,
+heritrix crawls from `no-capture` dumps, etc, similar to bulk file ingest
+process.
+
+For scaling, let's do a "full" ingest request generation of all datasets, and
+crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens
+of millions of mostly DOIs (doi.org URLs), should crawl quickly.
+
+Then, do bulk downloading with ingest worker, perhaps on misc-vm or aitio.
+uploading large datasets to archive.org, but not doing SPN web requests. Feed
+the resulting huge file seedlist into a heritrix crawl to download web files.
+
+Will need to add support for more specific platforms.
+
+
+### Huge Bulk Ingest Prep
+
+On prod instance:
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz
+ # Expecting 11264787 release objects in search queries
+ # TIMEOUT ERROR
+ # 6.07M 19:13:02 [87.7 /s] (partial)
+
+As follow-up, should do a full batch (not partial). For now search index is too
+unreliable (read timeouts).
+
+ zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \
+ | jq .base_url -r \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > ingest_dataset_bulk.2022-01-05.partial.schedule
+
+## Retries (2022-01-12)
+
+This is after having done a bunch of crawling.
+
+ cat ingest_dataset_combined_results6.json \
+ | rg '"no-capture"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request -c \
+ | pv -l \
+ > ingest_dataset_retry.json
+ => 6.51k 0:00:01 [3.55k/s]
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json
+
+## Retries (2022-02)
+
+Finally got things to complete end to end for this batch!
+
+ cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr
+ 3220 terminal-bad-status
+ 2120 no-capture
+ 380 empty-manifest
+ 264 success-file
+ 251 success
+ 126 success-existing
+ 39 mismatch
+ 28 error-platform-download
+ 24 too-many-files
+ 20 platform-scope
+ 13 platform-restricted
+ 13 mismatch-size
+ 6 too-large-size
+ 3 transfer-encoding-error
+ 2 no-platform-match
+ 2 error-archiveorg-upload
+ 1 redirect-loop
+ 1 empty-blob
+
+Some more URLs to crawl:
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt
+ # 1.00
+ # just a single DOI that failed to crawl, for whatever reason
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt
+
+These are ready to crawl, in the existing dataset crawl.
+
+ cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule
+
+## Running Uploads Again
+
+Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a
+big bummer! Will need to download many of these over again.
+
+ # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316
+ # skip_cleanup_local_files=True is still default
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json
+
+ # filter out zenodo, very slow:
+ # rg -v 10.5281 \
diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md
new file mode 100644
index 0000000..941519f
--- /dev/null
+++ b/notes/ingest/2022-01-06_patch_crawl.md
@@ -0,0 +1,398 @@
+
+Starting another paper fulltext patch crawl, targetting recent OA content which
+has failed to ingest, and platforms (arxiv, etc).
+
+Specifically:
+
+- "daily" changelog ingest requests from all time, which failed with various status codes
+- pdf no-capture
+- SPN errors
+- terminal-bad-status with 5xx, 429
+- gateway-timeout
+- html no-capture
+- html-resource-no-capture
+
+Most of these are dumped in a single complex query (below),
+
+TODO: html-resource-no-capture (from error message? or do SPN requests separately?)
+
+
+## Initial 'no-capture' Seedlist
+
+Dump terminal URLs (will do ingest requests later, using similar command):
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt';
+ => COPY 6389683
+
+TODO: filter out archive.org/www.archive.org
+
+ cat patch_terminal_url.2022-01-12.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-01-12.uniq.txt
+ => 5.73M 0:00:47 [ 120k/s]
+
+ # note: tweaks and re-ran the above after inspecting this output
+ cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 799045 doi.org
+ 317557 linkinghub.elsevier.com
+ 211091 arxiv.org
+ 204334 iopscience.iop.org
+ 139758 dialnet.unirioja.es
+ 130331 www.scielo.br
+ 124626 www.persee.fr
+ 85764 digitalrepository.unm.edu
+ 83913 www.mdpi.com
+ 79662 www.degruyter.com
+ 75703 www.e-periodica.ch
+ 72206 dx.doi.org
+ 69068 escholarship.org
+ 67848 idus.us.es
+ 57907 zenodo.org
+ 56624 ir.opt.ac.cn
+ 54983 projecteuclid.org
+ 52226 rep.bntu.by
+ 48376 osf.io
+ 48009 pubs.rsc.org
+ 46947 publikationen.ub.uni-frankfurt.de
+ 45564 www.research-collection.ethz.ch
+ 45153 dk.um.si
+ 43313 www.ssoar.info
+ 40543 scholarworks.umt.edu
+
+TODO: cleanup ingest request table in sandcrawler-db:
+- remove filtered OAI-PMH prefixes
+- remove any invalid `base_url` (?)
+
+## More Seedlist (2022-02-08)
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt';
+ => COPY 444764
+
+ cat patch_terminal_url.2022-02-08.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-02-08.uniq.txt
+ => 426k 0:00:04 [ 103k/s]
+
+ cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 60123 www.degruyter.com
+ 59314 arxiv.org
+ 43674 zenodo.org
+ 17771 doi.org
+ 9501 linkinghub.elsevier.com
+ 9379 www.mdpi.com
+ 5691 opendata.uni-halle.de
+ 5578 scholarlypublishingcollective.org
+ 5451 era.library.ualberta.ca
+ 4982 www.cairn.info
+ 4306 www.taylorfrancis.com
+ 4189 papers.ssrn.com
+ 4157 apps.crossref.org
+ 4089 www.sciencedirect.com
+ 4033 mdpi-res.com
+ 3763 dlc.mpg.de
+ 3408 osf.io
+ 2603 www.frontiersin.org
+ 2594 watermark.silverchair.com
+ 2569 journals.lww.com
+ 1787 underline.io
+ 1680 archiviostorico.fondazione1563.it
+ 1658 www.jstage.jst.go.jp
+ 1611 cyberleninka.ru
+ 1535 www.schoeningh.de
+
+ cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule
+ => Done
+
+Copied to crawler svc206 and added to frontier.
+
+
+## Bulk Ingest Requests (2022-02-28)
+
+Note that we are skipping OAI-PMH here, because we just did a separate ingest
+for those.
+
+This is going to dump many duplicate lines (same `base_url`, multiple
+requests), but that is fine. Expecting something like 7 million rows.
+
+ COPY (
+ -- SELECT ingest_file_result.terminal_url
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated <= '2022-02-08'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ -- ingest_request.link_source = 'oai'
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json';
+ # COPY 3053219
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md
new file mode 100644
index 0000000..a6f08dd
--- /dev/null
+++ b/notes/ingest/2022-01-13_doi_crawl.md
@@ -0,0 +1,248 @@
+
+Could roll this in to current patch crawl instead of starting a new crawl from scratch.
+
+This file is misnamed; these are mostly non-DOI-specific small updates.
+
+## KBART "almost complete" experimentation
+
+Random 10 releases:
+
+ cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}'
+ https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone
+ https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed
+ https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works
+ https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern)
+ https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. force re-crawl worked for one copy
+ https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success
+ https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref
+ https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success
+ https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success
+ https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed
+
+Try some more!
+
+ https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success
+ https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success?
+ https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry
+ https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site
+ https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI
+ https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success
+ https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success
+ https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken
+ https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub)
+ https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success
+
+
+## Seeds: fixed OJS URLs
+
+Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like:
+
+- `no-pdf-link` with terminal URL like `/article/view/`
+- `redirect-loop` with terminal URL like `/article/view/`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json';
+ => COPY 326577
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json
+ cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Done/running.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'link-loop'
+ )
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt';
+ => COPY 342415
+
+ cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule
+
+Done/seeded.
+
+## Seeds: scitemed.com
+
+Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%/article/view/%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json';
+ # SKIPPED
+
+Actually there are very few of these.
+
+## Seeds: non-OA paper DOIs
+
+There are many DOIs out there which are likely to be from small publishers, on
+the web, and would ingest just fine (eg, in OJS).
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count
+ 30,938,106
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count
+ 6,664,347
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count
+ 8,258,111
+
+Do the 8 million first, then maybe try the 30.9 million later? Do sampling to
+see how many are actually accessible? From experience with KBART generation,
+many of these are likely to crawl successfully.
+
+ ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz
+ # re-running 2022-02-08 after this VM was upgraded
+ # Expecting 8321448 release objects in search queries
+ # DONE
+
+This is large enough that it will probably be a bulk ingest, and then probably
+a follow-up crawl.
+
+## Seeds: HTML and XML links from HTML biblio
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \
+ | pv -l \
+ | rg '"(html|xml)_fulltext_url"' \
+ | rg '"no-pdf-link"' \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.json.gz
+
+ # cut this off at some point? gzip is terminated weird
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l
+ # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file
+ # 2,538,433
+
+Prepare seedlists (to include in heritrix patch crawl):
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.xml_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz
+ # 1.24M 0:01:35 [12.9k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.html_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz
+ # 549k 0:01:27 [6.31k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | cut -f3 -d/ \
+ | sort -S 4G \
+ | uniq -c \
+ | sort -nr \
+ | head -n20
+
+ 534005 dlc.library.columbia.edu
+ 355319 www.degruyter.com
+ 196421 zenodo.org
+ 101450 serval.unil.ch
+ 100631 biblio.ugent.be
+ 47986 digi.ub.uni-heidelberg.de
+ 39187 www.emerald.com
+ 33195 www.cairn.info
+ 25703 boris.unibe.ch
+ 19516 journals.openedition.org
+ 15911 academic.oup.com
+ 11091 repository.dl.itc.u-tokyo.ac.jp
+ 9847 oxfordworldsclassics.com
+ 9698 www.thieme-connect.de
+ 9552 www.idunn.no
+ 9265 www.zora.uzh.ch
+ 8030 www.scielo.br
+ 6543 www.hanspub.org
+ 6229 asmedigitalcollection.asme.org
+ 5651 brill.com
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | awk '{print "F+ " $1}' \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+ wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+ 1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+Added to `JOURNALS-PATCH-CRAWL-2022-01`
+
+## Seeds: most doi.org terminal non-success
+
+Unless it is a 404, should retry.
+
+TODO: generate this list
+
+## Non-OA DOI Bulk Ingest
+
+Had previously run:
+
+ cat ingest_nonoa_doi.json.gz \
+ | rg -v "doi.org/10.2139/" \
+ | rg -v "doi.org/10.1021/" \
+ | rg -v "doi.org/10.1121/" \
+ | rg -v "doi.org/10.1515/" \
+ | rg -v "doi.org/10.1093/" \
+ | rg -v "europepmc.org" \
+ | pv -l \
+ | gzip \
+ > nonoa_doi.filtered.ingests.json.gz
+ # 7.35M 0:01:13 [99.8k/s]
+
+Starting a bulk ingest of these on 2022-03-18, which is *before* the crawl has
+entirely finished, but after almost all queues (domains) have been done for
+several days.
+
+ zcat nonoa_doi.filtered.ingests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Looks like many jstage `no-capture` status; these are still (slowly) crawling.
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md
new file mode 100644
index 0000000..9722459
--- /dev/null
+++ b/notes/ingest/2022-03_doaj.md
@@ -0,0 +1,278 @@
+
+plan:
+- usual setup and dump ingest requests
+- filter ingest requests to targetted ccTLDs, and add those to crawl first
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz'
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz
+ # 9.08M 0:37:38 [4.02k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373})
+
+
+## Check Pre-Crawl Status
+
+2022-03-09, before the above load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 2919808
+ html | wrong-scope | 1098998
+ pdf | no-pdf-link | 481532
+ pdf | redirect-loop | 429006
+ html | success | 342501
+ html | unknown-scope | 225390
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187762
+ html | no-capture | 185418
+ pdf | no-capture | 171273
+ pdf | null-body | 129028
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91551
+ pdf | link-loop | 25447
+ html | wrong-mimetype | 22640
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ pdf | wrong-mimetype | 7688
+ xml | success | 6897
+ html | petabox-error | 5529
+ pdf | wayback-error | 2706
+ xml | null-body | 2353
+ pdf | | 2063
+ pdf | wayback-content-error | 1349
+ html | cdx-error | 1169
+ pdf | cdx-error | 1130
+ pdf | petabox-error | 679
+ html | | 620
+ pdf | empty-blob | 562
+ html | blocked-cookie | 545
+ (30 rows)
+
+After the above load:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3036457
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108132
+ pdf | no-pdf-link | 485703
+ pdf | redirect-loop | 436085
+ html | success | 342594
+ html | unknown-scope | 225412
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187999
+ html | no-capture | 187310
+ pdf | no-capture | 172033
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91799
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6897
+ html | petabox-error | 5530
+ pdf | wayback-error | 2707
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 771
+ pdf | empty-blob | 562
+ (30 rows)
+
+Dump ingest requests for crawling (or bulk ingest first?):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json';
+ => COPY 353819
+
+Not that many! Guess the filters are important?
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ );
+ => 3202164
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json
+ => 353k 0:00:16 [21.0k/s]
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Dump seeds again (for crawling):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json';
+ # COPY 350661
+
+And stats again:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3037059
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108476
+ pdf | no-pdf-link | 485705
+ pdf | redirect-loop | 436850
+ html | success | 342762
+ html | unknown-scope | 225412
+ html | redirect-loop | 224683
+ html | html-resource-no-capture | 188058
+ html | no-capture | 185734
+ pdf | no-capture | 170452
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91875
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19042
+ html | terminal-bad-status | 13333
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6898
+ html | petabox-error | 5535
+ pdf | wayback-error | 2711
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 772
+ html | blocked-cookie | 769
+ (30 rows)
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json
+
+Create seedlist:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | jq -r .base_url \
+ | sort -u -S 4G \
+ > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt
+
+Send off an added to `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl, will
+re-ingest when that completes (a week or two?).
+
+
+## Bulk Ingest
+
+After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up.
+
+ # 2022-03-22
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md
new file mode 100644
index 0000000..d2a8d71
--- /dev/null
+++ b/notes/ingest/2022-03_oaipmh.md
@@ -0,0 +1,40 @@
+
+Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl.
+
+Note that Martin excluded many Indonesian endpoints, will need to follow-up on
+those.
+
+## Prep
+
+Fetch metadata snapshot:
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst
+
+Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large):
+
+ zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \
+ | rg -v 'oai:kb.dk:' \
+ | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \
+ | rg -v 'oai:hispana.mcu.es:' \
+ | rg -v 'oai:bnf.fr:' \
+ | rg -v 'oai:ukm.si:' \
+ | rg -v 'oai:biodiversitylibrary.org:' \
+ | rg -v 'oai:hsp.org:' \
+ | rg -v 'oai:repec:' \
+ | rg -v 'oai:n/a:' \
+ | rg -v 'oai:quod.lib.umich.edu:' \
+ | rg -v 'oai:americanae.aecid.es:' \
+ | rg -v 'oai:www.irgrid.ac.cn:' \
+ | rg -v 'oai:espace.library.uq.edu:' \
+ | rg -v 'oai:edoc.mpg.de:' \
+ | rg -v 'oai:bibliotecadigital.jcyl.es:' \
+ | rg -v 'oai:repository.erciyes.edu.tr:' \
+ | rg -v 'oai:krm.or.kr:' \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz
+
+These failed to transform in the expected way; a change in JSON schema from last time?
diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md
new file mode 100644
index 0000000..23fd35f
--- /dev/null
+++ b/notes/ingest/2022-04_targeted.md
@@ -0,0 +1,144 @@
+
+Want to do a crawl similar to recent "patch" crawls, where we run heritrix
+crawls to "fill in" missing (`no-capture`) and failed dailing ingests (aka,
+those requests coming from fatcat-changelog).
+
+ export PATCHDATE=2022-04-20
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json';
+ # COPY 4842749
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v www.archive.org \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ # 4.75M 0:01:44 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 1515829 www.jstage.jst.go.jp
+ 1052953 doi.org
+ 241704 arxiv.org
+ 219543 www.sciencedirect.com
+ 178562 www.persee.fr
+ 84947 zenodo.org
+ 67397 www.mdpi.com
+ 65775 journals.lww.com
+ 58216 opg.optica.org
+ 50673 osf.io
+ 45776 www.degruyter.com
+ 36664 www.indianjournals.com
+ 35287 pubs.rsc.org
+ 33495 www.bmj.com
+ 33320 www.research-collection.ethz.ch
+ 29728 www.e-periodica.ch
+ 28338 iopscience.iop.org
+ 26364 www.cambridge.org
+ 23840 onlinelibrary.wiley.com
+ 23641 platform.almanhal.com
+ 22660 brill.com
+ 20288 www.osapublishing.org
+ 18561 cgscholar.com
+ 18539 doi.nrct.go.th
+ 15677 www.frontiersin.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+TODO: starting with the "quarterly retry" script/query might make more sense?
+TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set?
+
+## Bulk Ingest Requests (post-crawl)
+
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json
+ => 4.84M 0:03:14 [24.9k/s]
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => started 2022-05-11
diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md
new file mode 100644
index 0000000..bc78998
--- /dev/null
+++ b/notes/ingest/2022-04_unpaywall.md
@@ -0,0 +1,278 @@
+
+New unpaywall snapshot from `2022-03-09`.
+
+This will probably be the last unpaywall crawl? Will switch to openalex in the
+future, because we can automate that ingest process, and run it on our own
+schedule.
+
+ export SNAPSHOT=2022-03-09
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=UNPAYWALL-CRAWL-2022-04
+
+## Download and Archive
+
+ wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz'
+ # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470]
+
+ export SNAPSHOT=2022-03-09
+ ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT
+
+ # if needed
+ scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv shell
+
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json
+ # 34.9M 3:02:32 [3.19k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ # 34.9M 5:23:15 [1.80k/s]
+ # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779})
+
+So about 6.1M new ingest request rows.
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- take "all time" instead of just this recent capture
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json';
+ => COPY 6025671
+
+ # transform
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json
+ # 6.03M 0:03:26 [29.1k/s]
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330232
+ success | 2455102
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16078
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+After prior "TARGETED" crawl and bulk ingest finished:
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330055
+ success | 2455279
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16079
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Almost no change, which makes sense because of the `ingest_request.created`
+filter.
+
+
+## Dump Seedlist
+
+Dump rows for crawling:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ -- AND date(ingest_request.created) > '2022-04-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json';
+ => before ingest and arxiv.org DOI exclusion: COPY 3309091
+ => COPY 3308914
+
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json
+ => 3.31M 0:02:22 [23.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT*
+ 15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt
+ 3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json
+ 3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt
+ 3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt
+
+Inject seedlist into crawler:
+
+ scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+Top domains?
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20
+ 158497 www.scielo.br
+ 144732 onlinelibrary.wiley.com
+ 129349 www.researchsquare.com
+ 94923 hal.archives-ouvertes.fr
+ 69293 openresearchlibrary.org
+ 64584 www.cell.com
+ 60033 link.springer.com
+ 50528 www.degruyter.com
+ 49737 projecteuclid.org
+ 45841 www.jstage.jst.go.jp
+ 44819 www.mdpi.com
+ 44325 ieeexplore.ieee.org
+ 38091 dr.lib.iastate.edu
+ 31030 www.nature.com
+ 30300 discovery.ucl.ac.uk
+ 27692 ntrs.nasa.gov
+ 24215 orca.cardiff.ac.uk
+ 23653 www.frontiersin.org
+ 23474 pure.rug.nl
+ 22660 www.sciencedirect.com
+
+
+## Post-Crawl bulk ingest
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # done: 2022-07-06
+
+## Post-Crawl, Post-Ingest Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 4784948 => +2,329,669 ~77%
+ redirect-loop | 485270 => + 288,153 ~10%
+ no-capture | 317598 => -3,012,457
+ terminal-bad-status | 267853 => + 185,235 ~ 6%
+ no-pdf-link | 118303 => + 85,257
+ blocked-cookie | 111373 => + 95,294
+ skip-url-blocklist | 19368
+ link-loop | 9091
+ wrong-mimetype | 7163
+ cdx-error | 2516
+ empty-blob | 1961
+ wayback-error | 1922
+ body-too-large | 509
+ petabox-error | 416
+ wayback-content-error | 341
+ bad-gzip-encoding | 281
+ | 253
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Groovy!
diff --git a/notes/ingest/2022-07-15_ingest_fixes.md b/notes/ingest/2022-07-15_ingest_fixes.md
new file mode 100644
index 0000000..ec31a7d
--- /dev/null
+++ b/notes/ingest/2022-07-15_ingest_fixes.md
@@ -0,0 +1,831 @@
+
+## HTML `html-resource-no-capture` Fixes
+
+Tracing down some `html-resource-no-capture` issues. Eg, `javascript:` resources causing errors.
+
+SQL query:
+
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' limit 100;
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' order by random() limit 100;
+
+ select count(*) from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture';
+ => 210,528
+
+http://agroengineering.it/index.php/jae/article/view/568/609
+- old capture, from `20171017204935`
+- missing .css file; seems like an actual case of missing content?
+- TODO: re-crawl/re-ingest when CDX is old
+
+https://www.karger.com/Article/FullText/484130
+- missing: https://www.karger.com/WebMaterial/ShowThumbnail/895999?imgType=2
+- resource is live
+- this was from DOI-LANDING crawl, no resources captured
+- TODO: re-crawl
+
+https://www.mdpi.com/1996-1073/13/21/5563/htm
+- missing: https://www.mdpi.com/1996-1073/13/21/5563/htm
+- common crawl capture; no/few resources?
+- TODO: re-crawl
+
+http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-736X2013000500011&lng=en&tlng=en
+- missing: http://www.scielo.br/img/revistas/pvb/v33n5/a11tab01.jpg
+ not on live web
+- old (2013) wide crawl
+- TODO: re-crawl
+
+http://g3journal.org/lookup/doi/10.1534/g3.116.027730
+- missing: http://www.g3journal.org/sites/default/files/highwire/ggg/6/8/2553/embed/mml-math-4.gif
+- old 2018 landing crawl (no resources)
+- TODO: re-crawl
+
+https://www.frontiersin.org/articles/10.3389/fimmu.2020.576134/full
+- "error_message": "revisit record missing URI and/or DT: warc:abc.net.au-news-20220328-130654/IA-FOC-abc.net.au-news-20220618135308-00003.warc.gz offset:768320762"
+- specific URL: https://www.frontiersin.org/areas/articles/js/app?v=uC9Es8wJ9fbTy8Rj4KipiyIXvhx7XEVhCTHvIrM4ShA1
+- archiveteam crawl
+- seems like a weird corner case. look at more 'frontiersin' articles, and re-crawl this page
+
+https://www.frontiersin.org/articles/10.3389/fonc.2020.01386/full
+- WORKING
+
+https://doi.org/10.4000/trajectoires.2317
+- redirect: https://journals.openedition.org/trajectoires/2317
+- missing: "https://journals.openedition.org/trajectoires/Ce fichier n'existe pas" (note spaces)
+- FIXED
+
+http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S1413-81232002000200008&lng=en&tlng=en
+- WORKING
+
+https://f1000research.com/articles/9-571/v2
+- petabox-error on 'https://www.recaptcha.net/recaptcha/api.js'
+- added recaptcha.net to blocklist
+- still needs a re-crawl
+- SPN capture, from 2020, but images were missing?
+- re-capture has images (though JS still wonky)
+- TODO: re-crawl with SPN2
+
+http://bio.biologists.org/content/4/9/1163
+- DOI LANDING crawl, no sub-resources
+- TODO: recrawl
+
+http://err.ersjournals.com/content/26/145/170039.full
+- missing: http://err.ersjournals.com/sites/default/files/highwire/errev/26/145/170039/embed/graphic-5.gif
+ on live web
+- 2017 targetted heritrix crawl
+- TODO: recrawl
+
+http://www.dovepress.com/synthesis-characterization-and-antimicrobial-activity-of-an-ampicillin-peer-reviewed-article-IJN
+- missing: https://www.dovepress.com/cr_data/article_fulltext/s61000/61143/img/IJN-61143-F02-Thumb.jpg
+- recent archiveteam crawl
+- TODO: recrawl
+
+http://journals.ed.ac.uk/lithicstudies/article/view/1444
+- missing: http://journals.ed.ac.uk/lithicstudies/article/download/1444/2078/6081
+- common crawl
+- TODO: recrawl
+
+http://medisan.sld.cu/index.php/san/article/view/495
+- missing: http://ftp.scu.sld.cu/galen/medisan/logos/redib.jpg
+- this single resource is legit missing
+
+seems like it probably isn't a bad idea to just re-crawl all of these with fresh SPNv2 requests
+
+request sources:
+- fatcat-changelog (doi)
+- fatcat-ingest (doi)
+- doaj
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'html'
+ AND ingest_file_result.status = 'html-resource-no-capture'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json';
+ => COPY 210749
+
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json
+
+Try a sample of 300:
+
+ shuf -n300 /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Seeing a bunch of:
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fphys.2020.00454/full","https://www.frontiersin.org/articles/10.3389/fphys.2020.00454/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fmicb.2019.02507/full","https://www.frontiersin.org/articles/10.3389/fmicb.2019.02507/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.mdpi.com/2218-1989/10/9/366","https://www.mdpi.com/2218-1989/10/9/366/htm","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:964129887"]
+
+ "error_message": "revisit record missing URI and/or DT: warc:online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz offset:751923069",
+
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fnins.2020.00724/full","https://www.frontiersin.org/articles/10.3389/fnins.2020.00724/full","wayback payload sha1hex mismatch: 20220715222216 https://static.frontiersin.org/areas/articles/js/app?v=DfnFHSIgqDJBKQy2bbQ2S8vWyHe2dEMZ1Lg9o6vSS1g1"]
+
+These seem to be transfer encoding issues; fixed?
+
+ ["doaj","html-resource-no-capture","http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S0021-25712013000400003&lng=en&tlng=en","https://scielosp.org/article/aiss/2013.v49n4/336-339/en/","HTML sub-resource not found: https://ssm.scielo.org/media/assets/css/scielo-print.css"]
+
+Full batch:
+
+ # TODO: cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Not running the full batch for now, because there are almost all `wayback-content-error` issues.
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | wc -l
+ 114935
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+
+## Redirect Loops
+
+Seems like there might have been a bug in how ingest pipeline dealt with
+multiple redirects (eg, 301 to 302 or vice-versa), due to how CDX lookups and
+normalization was happening.
+
+This could be a really big deal because we have over 11 million such ingest
+requests! and may even have stopped crawling domains on the basis of redirect
+looping.
+
+ select * from ingest_file_result where ingest_type = 'pdf' and status = 'redirect-loop' limit 50;
+
+http://ieeexplore.ieee.org/iel7/7259950/7275573/07275755.pdf
+- 'skip-url-blocklist'
+- paywall on live web
+
+http://www.redjournal.org/article/S0360301616308276/pdf
+- redirect to 'secure.jbs.elsevierhealth.com'
+- ... but re-crawling with SPNv2 worked
+- TODO: reingest this entire journal with SPNv2
+
+http://www.jmirs.org/article/S1939865415001551/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+http://www.cell.com/article/S0006349510026147/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- TODO: try SPNv2?
+- RECRAWL: success
+
+http://infoscience.epfl.ch/record/256431/files/SPL_2018.pdf
+- FIXED: success
+
+http://www.nature.com/articles/hdy1994143.pdf
+- blocked-cookie (idp.nature.com / cookies_not_supported)
+- RECRAWL: gateway-timeout
+
+http://www.thelancet.com/article/S0140673619327606/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+https://pure.mpg.de/pubman/item/item_2065970_2/component/file_2065971/Haase_2014.pdf
+- FIXED: success
+
+http://hdl.handle.net/21.11116/0000-0001-B1A2-F
+- FIXED: success
+
+http://repositorio.ufba.br/ri/bitstream/ri/6072/1/%2858%29v21n6a03.pdf
+- FIXED: success
+
+http://www.jto.org/article/S1556086416329999/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+http://www.jahonline.org/article/S1054139X16303020/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+So, wow wow wow, a few things to do here:
+
+- just re-try all these redirect-loop attempts to update status
+- re-ingest all these elsevierhealth blocked crawls with SPNv2. this could take a long time!
+
+Possibly the elsevierhealth stuff will require some deeper fiddling to crawl
+correctly.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'redirect-loop'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json';
+ => COPY 6611342
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json
+
+Start with a sample:
+
+ shuf -n200 /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Wow that is a lot of ingest! And a healthy fraction of 'success', almost all
+via unpaywall (maybe should have done DOAJ/DOI only first). Let's do this full
+batch:
+
+ cat /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+TODO: repeat with broader query (eg, OAI-PMH, MAG, etc).
+
+## Other
+
+Revist resolution failed: \"Didn't get exact CDX url/datetime match. url:https://www.cairn.info/static/images//logo/logo-cairn-negatif.png dt:20220430145322 got:CdxRow(surt='info,cairn)/static/images/logo/logo-cairn-negatif.png', datetime='20220430145322', url='https://www.cairn.info/static/images/logo/logo-cairn-negatif.png', mimetype='image/png', status_code=200, sha1b32='Y3VQOPO2NFUR2EUWNXLYGYGNZPZLQYHU', sha1hex='c6eb073dda69691d12966dd78360cdcbf2b860f4', warc_csize=10875, warc_offset=2315284914, warc_path='archiveteam_archivebot_go_20220430212134_59230631/old.worldurbancampaign.org-inf-20220430-140628-acnq5-00000.warc.gz')\""
+
+ https://www.cairn.info/static/images//logo/logo-cairn-negatif.png 20220430145322
+ https://www.cairn.info/static/images/logo/logo-cairn-negatif.png 20220430145322
+
+Fixed!
+
+
+## Broken WARC Record?
+
+cdx line:
+
+ net,cloudfront,d1bxh8uas1mnw7)/assets/embed.js 20220716084026 https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js warc/revisit - U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB - - 660 751923069 online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz
+
+download WARC and run:
+
+ zcat IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz | rg d1bxh8uas1mnw7.cloudfront.net/assets/embed.js -a -C 20
+
+the WARC record:
+
+ WARC/1.0
+ WARC-Type: revisit
+ WARC-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js
+ WARC-Date: 2022-07-16T08:40:26Z
+ WARC-Payload-Digest: sha1:U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB
+ WARC-IP-Address: 13.227.21.220
+ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+ WARC-Truncated: length
+ WARC-Record-ID: <urn:uuid:cc79139e-d43f-4b43-9b9e-f923610344d0>
+ Content-Type: application/http; msgtype=response
+ Content-Length: 493
+
+ HTTP/1.1 200 OK
+ Content-Type: application/javascript
+ Content-Length: 512
+ Connection: close
+ Last-Modified: Fri, 22 Apr 2022 08:45:38 GMT
+ Accept-Ranges: bytes
+ Server: AmazonS3
+ Date: Fri, 15 Jul 2022 16:36:08 GMT
+ ETag: "1c28db48d4012f0221b63224a3bb7137"
+ Vary: Accept-Encoding
+ X-Cache: Hit from cloudfront
+ Via: 1.1 5b475307685b5cecdd0df414286f5438.cloudfront.net (CloudFront)
+ X-Amz-Cf-Pop: SFO20-C1
+ X-Amz-Cf-Id: SIRR_1LT8mkp3QVaiGYttPuomxyDfJ-vB6dh0Slg_qqyW0_WwnA1eg==
+ Age: 57859
+
+where are the `WARC-Refers-To-Target-URI` and `WARC-Refers-To-Date` lines?
+
+## osf.io
+
+ select status, terminal_status_code, count(*) from ingest_file_result where base_url LIKE 'https://doi.org/10.17605/osf.io/%' and ingest_type = 'pdf' group by status, terminal_status_code order by count(*) desc limit 30;
+
+ status | terminal_status_code | count
+ -------------------------+----------------------+-------
+ terminal-bad-status | 404 | 92110
+ no-pdf-link | 200 | 46932
+ not-found | 200 | 20212
+ no-capture | | 8599
+ success | 200 | 7604
+ redirect-loop | 301 | 2125
+ terminal-bad-status | 503 | 1657
+ cdx-error | | 1301
+ wrong-mimetype | 200 | 901
+ terminal-bad-status | 410 | 364
+ read-timeout | | 167
+ wayback-error | | 142
+ gateway-timeout | | 139
+ terminal-bad-status | 500 | 76
+ spn2-error | | 63
+ spn2-backoff | | 42
+ petabox-error | | 39
+ spn2-backoff | 200 | 27
+ redirect-loop | 302 | 19
+ terminal-bad-status | 400 | 15
+ terminal-bad-status | 401 | 15
+ remote-server-error | | 14
+ timeout | | 11
+ terminal-bad-status | | 11
+ petabox-error | 200 | 10
+ empty-blob | 200 | 8
+ null-body | 200 | 6
+ spn2-error:unknown | | 5
+ redirect-loop | 308 | 4
+ spn2-cdx-lookup-failure | | 4
+ (30 rows)
+
+Many of these are now non-existant, or datasets/registrations not articles.
+Hrm.
+
+
+## Large DOAJ no-pdf-link Domains
+
+ SELECT
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain,
+ COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_request.base_url = ingest_file_result.base_url
+ WHERE
+ ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source = 'doaj'
+ GROUP BY
+ domain
+ ORDER BY
+ COUNT(*) DESC
+ LIMIT 50;
+
+ domain | count
+ -------------------------------------------------------+--------
+ www.sciencedirect.com | 211090
+ auth.openedition.org | 20741
+ journal.frontiersin.org:80 | 11368
+ journal.frontiersin.org | 6494
+ ejde.math.txstate.edu | 4301
+ www.arkat-usa.org | 4001
+ www.scielo.br | 3736
+ www.lcgdbzz.org | 2892
+ revistas.uniandes.edu.co | 2715
+ scielo.sld.cu | 2612
+ www.egms.de | 2488
+ journals.lww.com | 2415
+ ter-arkhiv.ru | 2239
+ www.kitlv-journals.nl | 2076
+ www.degruyter.com | 2061
+ jwcn-eurasipjournals.springeropen.com | 1929
+ www.cjcnn.org | 1908
+ www.aimspress.com | 1885
+ vsp.spr-journal.ru | 1873
+ dx.doi.org | 1648
+ www.dlib.si | 1582
+ aprendeenlinea.udea.edu.co | 1548
+ www.math.u-szeged.hu | 1448
+ dergipark.org.tr | 1444
+ revistas.uexternado.edu.co | 1429
+ learning-analytics.info | 1419
+ drive.google.com | 1399
+ www.scielo.cl | 1326
+ www.economics-ejournal.org | 1267
+ www.jssm.org | 1240
+ html.rhhz.net | 1232
+ journalofinequalitiesandapplications.springeropen.com | 1214
+ revistamedicina.net | 1197
+ filclass.ru | 1154
+ ceramicayvidrio.revistas.csic.es | 1152
+ gynecology.orscience.ru | 1126
+ www.tobaccoinduceddiseases.org | 1090
+ www.tandfonline.com | 1046
+ www.querelles-net.de | 1038
+ www.swjpcc.com | 1032
+ microbiologyjournal.org | 1028
+ revistas.usal.es | 1027
+ www.medwave.cl | 1023
+ ijtech.eng.ui.ac.id | 1023
+ www.scielo.sa.cr | 1021
+ vestnik.szd.si | 986
+ www.biomedcentral.com:80 | 984
+ scielo.isciii.es | 983
+ bid.ub.edu | 970
+ www.meirongtv.com | 959
+ (50 rows)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ejde.math.txstate.edu%' limit 5;
+ http://ejde.math.txstate.edu/Volumes/2018/30/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2012/137/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2016/268/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2015/194/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2014/43/abstr.html
+ # plain HTML, not really parse-able
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.arkat-usa.org%' limit 5;
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0013.909
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0007.717
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.p008.158
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0014.216
+ # fixed (embed PDF)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.scielo.br%' limit 5;
+ https://doi.org/10.5935/0034-7280.20200075
+ https://doi.org/10.5935/0004-2749.20200071
+ https://doi.org/10.5935/0034-7280.20200035
+ http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-44461999000400014
+ https://doi.org/10.5935/0034-7280.20200047
+ # need recrawls?
+ # then success
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.lcgdbzz.org%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://revistas.uniandes.edu.co%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://scielo.sld.cu%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.egms.de%' limit 5;
+ https://doi.org/10.3205/16dgnc020
+ http://nbn-resolving.de/urn:nbn:de:0183-19degam1126
+ http://www.egms.de/en/meetings/dgpraec2019/19dgpraec032.shtml
+ http://www.egms.de/en/meetings/dkou2019/19dkou070.shtml
+ http://nbn-resolving.de/urn:nbn:de:0183-20nrwgu625
+ # mostly abstracts, don't have PDF versions
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ter-arkhiv.ru%' limit 5;
+ https://doi.org/10.26442/terarkh201890114-47
+ https://doi.org/10.26442/00403660.2019.12.000206
+ https://journals.eco-vector.com/0040-3660/article/download/32246/pdf
+ https://journals.eco-vector.com/0040-3660/article/download/33578/pdf
+ https://doi.org/10.26442/00403660.2019.12.000163
+ # working, needed recrawls (some force re-crawls)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.kitlv-journals.nl%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.cjcnn.org%' limit 5;
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.dlib.si%' limit 5;
+ https://srl.si/ojs/srl/article/view/2910
+ https://srl.si/ojs/srl/article/view/3640
+ https://srl.si/ojs/srl/article/view/2746
+ https://srl.si/ojs/srl/article/view/2557
+ https://srl.si/ojs/srl/article/view/2583
+ # fixed? (dlib.si)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.jssm.org%' limit 5;
+ http://www.jssm.org/vol4/n4/8/v4n4-8text.php
+ http://www.jssm.org/vol7/n1/19/v7n1-19text.php
+ http://www.jssm.org/vol9/n3/10/v9n3-10text.php
+ http://www.jssm.org/abstresearcha.php?id=jssm-14-347.xml
+ http://www.jssm.org/vol7/n2/11/v7n2-11text.php
+ # works as an HTML document? otherwise hard to select on PDF link
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://filclass.ru%' limit 5;
+ https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism
+ https://filclass.ru/en/archive/2015/42/training-as-an-effective-form-of-preparation-for-the-final-essay
+ https://filclass.ru/en/archive/2020/vol-25-3/didaktizatsiya-literatury-rossijskikh-nemtsev-zanyatie-po-poeme-viktora-klyajna-jungengesprach
+ https://filclass.ru/en/archive/2015/40/the-communicative-behaviour-of-the-russian-intelligentsia-and-its-reflection-in-reviews-as-a-genre-published-in-online-literary-journals-abroad
+ https://filclass.ru/en/archive/2016/46/discoursive-means-of-implication-of-instructive-components-within-the-anti-utopia-genre
+ # fixed
+ # TODO: XXX: re-crawl/ingest
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://microbiologyjournal.org%' limit 5;
+ https://microbiologyjournal.org/the-relationship-between-the-type-of-infection-and-antibiotic-resistance/
+ https://microbiologyjournal.org/antimicrobial-resistant-shiga-toxin-producing-escherichia-coli-isolated-from-ready-to-eat-meat-products-and-fermented-milk-sold-in-the-formal-and-informal-sectors-in-harare-zimbabwe/
+ https://microbiologyjournal.org/emerging-antibiotic-resistance-in-mycoplasma-microorganisms-designing-effective-and-novel-drugs-therapeutic-targets-current-knowledge-and-futuristic-prospects/
+ https://microbiologyjournal.org/microbiological-and-physicochemicalpropertiesofraw-milkproduced-from-milking-to-delivery-to-milk-plant/
+ https://microbiologyjournal.org/association-of-insulin-based-insulin-resistance-with-liver-biomarkers-in-type-2-diabetes-mellitus/
+ # HTML article, no PDF
+ # ... but only sometimes
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.medwave.cl%' limit 5;
+ http://www.medwave.cl/link.cgi/Medwave/Perspectivas/Cartas/6878
+ https://www.medwave.cl/link.cgi/Medwave/Revisiones/RevisionClinica/8037.act
+ http://dx.doi.org/10.5867/medwave.2012.03.5332
+ https://www.medwave.cl/link.cgi/Medwave/Estudios/Casos/7683.act
+ http://www.medwave.cl/link.cgi/Medwave/Revisiones/CAT/5964
+ # HTML article, no PDF
+
+Re-ingest HTML:
+
+ https://fatcat.wiki/container/mafob4ewkzczviwipyul7knndu (DONE)
+ https://fatcat.wiki/container/6rgnsrp3rnexdoks3bxcmbleda (DONE)
+
+Re-ingest PDF:
+
+ doi_prefix:10.5935 (DONE)
+ doi_prefix:10.26442
+
+## More Scielo
+
+More scielo? `doi_prefix:10.5935 in_ia:false`
+
+ http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873
+ # OJS? fixed
+
+ https://revistas.unicentro.br/index.php/repaa/article/view/2667/2240
+ # working, but needed re-crawl
+
+ http://www.rbcp.org.br/details/2804/piezoelectric-preservative-rhinoplasty--an-alternative-approach-for-treating-bifid-nose-in-tessier-no--0-facial-cleft
+
+A few others, mostly now working
+
+## Recent OA DOIs
+
+ fatcat-cli search release 'is_oa:true (type:article-journal OR type:article OR type:paper-conference) !doi_prefix:10.5281 !doi_prefix:10.6084 !doi_prefix:10.48550 !doi_prefix:10.25446 !doi_prefix:10.25384 doi:* date:>2022-06-15 date:<2022-07-15 in_ia:false !publisher_type:big5' --index-json --limit 0 | pv -l > recent_missing_oa.json
+
+ wc -l recent_missing_oa.json
+ 24433
+
+ cat recent_missing_oa.json | jq .doi_prefix -r | sort | uniq -c | sort -nr | head
+ 4968 10.3390
+ 1261 10.1080
+ 687 10.23668
+ 663 10.1021
+ 472 10.1088
+ 468 10.4000
+ 367 10.3917
+ 357 10.1364
+ 308 10.4230
+ 303 10.17863
+
+ cat recent_missing_oa.json | jq .doi_registrar -r | sort | uniq -c | sort -nr
+ 19496 crossref
+ 4836 datacite
+ 101 null
+
+ cat recent_missing_oa.json | jq .publisher_type -r | sort | uniq -c | sort -nr
+ 9575 longtail
+ 8419 null
+ 3861 society
+ 822 unipress
+ 449 oa
+ 448 scielo
+ 430 commercial
+ 400 repository
+ 22 other
+ 7 archive
+
+ cat recent_missing_oa.json | jq .publisher -r | sort | uniq -c | sort -nr | head
+ 4871 MDPI AG
+ 1107 Informa UK (Taylor & Francis)
+ 665 EAG-Publikationen
+ 631 American Chemical Society
+ 451 IOP Publishing
+ 357 The Optical Society
+ 347 OpenEdition
+ 309 CAIRN
+ 308 Schloss Dagstuhl - Leibniz-Zentrum für Informatik
+ 303 Apollo - University of Cambridge Repository
+
+ cat recent_missing_oa.json | jq .container_name -r | sort | uniq -c | sort -nr | head
+ 4908 null
+ 378 Sustainability
+ 327 ACS Omega
+ 289 Optics Express
+ 271 International Journal of Environmental Research and Public Health
+ 270 International Journal of Health Sciences
+ 238 Sensors
+ 223 International Journal of Molecular Sciences
+ 207 Molecules
+ 193 Proceedings of the National Academy of Sciences of the United States of America
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | wc -l
+ 16558
+
+ cat recent_missing_oa.json | rg -i mdpi | shuf -n10 | jq .doi -r
+ 10.3390/molecules27144419
+ => was a 404
+ => recrawl was successful
+ 10.3390/math10142398
+ => was a 404
+ 10.3390/smartcities5030039
+ => was a 404
+
+Huh, we need to re-try/re-crawl MDPI URLs every week or so? Or special-case this situation.
+Could be just a fatcat script, or a sandcrawler query.
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | shuf -n10 | jq .doi -r
+
+ https://doi.org/10.18452/24860
+ => success (just needed quarterly retry?)
+ => b8c6c86aebd6cd2d85515441bbce052bcff033f2 (not in fatcat.wiki)
+ => current status is "bad-redirect"
+ https://doi.org/10.26181/20099540.v1
+ => success
+ => 3f9b1ff2a09f3ea9051dbbef277579e8a0b4df30
+ => this is figshare, and versioned. PDF was already attached to another DOI: https://doi.org/10.26181/20099540
+ https://doi.org/10.4230/lipics.sea.2022.22
+ => there is a bug resulting in trailing slash in `citation_pdf_url`
+ => fixed as a quirks mode
+ => emailed to report
+ https://doi.org/10.3897/aca.5.e89679
+ => success
+ => e6fd1e066c8a323dc56246631748202d5fb48808
+ => current status is 'bad-redirect'
+ https://doi.org/10.1103/physrevd.105.115035
+ => was 404
+ => success after force-recrawl of the terminal URL (not base URL)
+ https://doi.org/10.1155/2022/4649660
+ => was 404
+ => success after force-recrawl (of base_url)
+ https://doi.org/10.1090/spmj/1719
+ => paywall (not actually OA)
+ => https://fatcat.wiki/container/x6jfhegb3fbv3bcbqn2i3espiu is on Szczepanski list, but isn't all OA?
+ https://doi.org/10.1139/as-2022-0011
+ => was no-pdf-link
+ => fixed fulltext URL extraction
+ => still needed to re-crawl terminal PDF link? hrm
+ https://doi.org/10.31703/grr.2022(vii-ii).02
+ => was no-pdf-link
+ => fixed! success
+ https://doi.org/10.1128/spectrum.00154-22
+ => was 404
+ => now repeatably 503, via SPN
+ https://doi.org/10.51601/ijersc.v3i3.393
+ => 503 server error
+ https://doi.org/10.25416/ntr.20137379.v1
+ => is figshare
+ => docx (not PDF)
+ https://doi.org/10.25394/pgs.20263698.v1
+ => figshare
+ => embargo'd
+ https://doi.org/10.24850/j-tyca-14-4-7
+ => was no-pdf-link
+ => docs.google.com/viewer (!)
+ => now handle this (success)
+ https://doi.org/10.26267/unipi_dione/1832
+ => was bad-redirect
+ => success
+ https://doi.org/10.25560/98019
+ => body-too-large
+ => also, PDF metadata fails to parse
+ => is actually like 388 MByte
+ https://doi.org/10.14738/abr.106.12511
+ => max-hops-exceeded
+ => bumped max-hops from 6 to 8
+ => then success (via google drive)
+ https://doi.org/10.24350/cirm.v.19933803
+ => video, not PDF
+ https://doi.org/10.2140/pjm.2022.317.67
+ => link-loop
+ => not actually OA
+ https://doi.org/10.26265/polynoe-2306
+ => was bad-redirect
+ => now success
+ https://doi.org/10.3389/fpls.2022.826875
+ => frontiers
+ => was terminal-bad-status (403)
+ => success on retry (not sure why)
+ => maybe this is also a date-of-publication thing?
+ => not sure all these should be retried though
+ https://doi.org/10.14198/medcom.22240
+ => was terminal-bad-status (404)
+ => force-recrawl resulted in an actual landing page, but still no-pdf-link
+ => but actual PDF is a real 404, it seems. oh well
+ https://doi.org/10.31729/jnma.7579
+ => no-capture
+ https://doi.org/10.25373/ctsnet.20146931.v2
+ => figshare
+ => video, not document or PDF
+ https://doi.org/10.1007/s42600-022-00224-0
+ => not yet crawled/attempted (!)
+ => springer
+ => not actually OA
+ https://doi.org/10.37391/ijeer.100207
+ => some upstream issue (server not found)
+ https://doi.org/10.1063/5.0093946
+ => aip.scitation.org, is actually OA (can download in browser)
+ => cookie trap?
+ => redirect-loop (seems like a true redirect loop)
+ => retrying the terminal PDF URL seems to have worked
+ https://doi.org/10.18502/jchr.v11i2.9998
+ => no actual fulltext on publisher site
+ https://doi.org/10.1128/spectrum.01144-22
+ => this is a 503 error, even after retrying. weird!
+
+DONE: check `publisher_type` in chocula for:
+- "MDPI AG"
+- "Informa UK (Taylor & Francis)"
+
+ cat recent_missing_oa.json | jq '[.publisher, .publisher_type]' -c | sort | uniq -c | sort -nr | head -n40
+ 4819 ["MDPI AG","longtail"]
+ 924 ["Informa UK (Taylor & Francis)",null]
+ 665 ["EAG-Publikationen",null]
+ 631 ["American Chemical Society","society"]
+ 449 ["IOP Publishing","society"]
+ 357 ["The Optical Society","society"]
+ 336 ["OpenEdition","oa"]
+ 309 ["CAIRN","repository"]
+ 308 ["Schloss Dagstuhl - Leibniz-Zentrum für Informatik",null]
+ 303 ["Apollo - University of Cambridge Repository",null]
+ 292 ["Springer (Biomed Central Ltd.)",null]
+ 275 ["Purdue University Graduate School",null]
+ 270 ["Suryasa and Sons","longtail"]
+ 257 ["La Trobe",null]
+ 216 ["Frontiers Media SA","longtail"]
+ 193 ["Proceedings of the National Academy of Sciences","society"]
+ 182 ["Informa UK (Taylor & Francis)","longtail"]
+ 176 ["American Physical Society","society"]
+ 168 ["Institution of Electrical Engineers","society"]
+ 166 ["Oxford University Press","unipress"]
+ 153 ["Loughborough University",null]
+
+ chocula mostly seems to set these correctly. is the issue that the chocula
+ computed values aren't coming through or getting updated? probably. both
+ the release (from container) metadata update; and chocula importer not
+ doing updates based on this field; and some old/incorrect values.
+
+ did some cleanups of specific containers, and next chocula update should
+ result in a bunch more `publisher_type` getting populated on older
+ containers
+
+
+TODO: verify URLs are actualy URLs... somewhere? in the ingest pipeline
+
+TODO: fatcat: don't ingest figshare "work" DOIs, only the "versioned" ones (?)
+ doi_prefix:10.26181
+
+WIP: sandcrawler: regularly (weekly?) re-try 404 errors (the terminal URL, not the base url?) (or, some kind of delay?)
+ doi_prefix:10.3390 (MDPI)
+ doi_prefix:10.1103
+ doi_prefix:10.1155
+
+DONE: simply re-ingest all:
+ doi_prefix:10.4230
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.4230'
+ # Counter({'ingest_request': 2096, 'elasticsearch_release': 2096, 'estimate': 2096, 'kafka': 2096})
+ container_65lzi3vohrat5nnymk3dqpoycy
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 65lzi3vohrat5nnymk3dqpoycy
+ # Counter({'ingest_request': 187, 'elasticsearch_release': 187, 'estimate': 187, 'kafka': 187})
+ container_5vp2bio65jdc3blx6rfhp3chde
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 5vp2bio65jdc3blx6rfhp3chde
+ # Counter({'ingest_request': 83, 'elasticsearch_release': 83, 'estimate': 83, 'kafka': 83})
+
+DONE: verify and maybe re-ingest all:
+ is_oa:true publisher:"Canadian Science Publishing" in_ia:false
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --allow-non-oa --ingest-type pdf --force-recrawl query 'year:>2010 is_oa:true publisher:"Canadian Science Publishing" in_ia:false !journal:print'
+ # Counter({'ingest_request': 1041, 'elasticsearch_release': 1041, 'estimate': 1041, 'kafka': 1041})
+
+
+## Re-Ingest bad-redirect, max-hops-exceeded, and google drive
+
+Similar to `redirect-loop`:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'bad-redirect'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json';
+ # COPY 100011
+ # after first run: COPY 5611
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'max-hops-exceeded'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json';
+ # COPY 3546
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.hit is false
+ AND ingest_file_result.terminal_url like 'https://docs.google.com/viewer%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json';
+ # COPY 1082
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json
+
+ cat /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ # DONE
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md
new file mode 100644
index 0000000..74aeb8d
--- /dev/null
+++ b/notes/ingest/2022-07-19_dblp.md
@@ -0,0 +1,50 @@
+
+Cross-posting from fatcat bulk metadata update/ingest.
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 631k 0:00:11 [54.0k/s]
+
+
+## Post-Crawl Stats
+
+This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run
+2022-09-06:
+
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'dblp'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-----------------------+--------
+ pdf | success | 305142
+ pdf | no-pdf-link | 192683
+ pdf | no-capture | 42634
+ pdf | terminal-bad-status | 38041
+ pdf | skip-url-blocklist | 31055
+ pdf | link-loop | 9263
+ pdf | wrong-mimetype | 4545
+ pdf | redirect-loop | 3952
+ pdf | empty-blob | 2705
+ pdf | wayback-content-error | 834
+ pdf | wayback-error | 294
+ pdf | petabox-error | 202
+ pdf | blocked-cookie | 155
+ pdf | cdx-error | 115
+ pdf | body-too-large | 66
+ pdf | bad-redirect | 19
+ pdf | timeout | 7
+ pdf | bad-gzip-encoding | 4
+ (18 rows)
+
+That is quite a lot of `no-pdf-link`, might be worth doing a random sample
+and/or re-ingest. And a chunk of `no-capture` to retry.
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@
+
+This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for
+heritrix bulk crawling, along with JALC and DOAJ URLs.
+
+ export SNAPSHOT=2022-07-20
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
+ # 9.72M 0:36:28 [4.44k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # 9.72M 0:17:04 [9.49k/s]
+ # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
+
+Stats after this load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3165539
+ pdf | | 2078874
+ html | | 1547698
+ html | wrong-scope | 1114332
+ pdf | no-pdf-link | 517261
+ html | success | 388376
+ html | unknown-scope | 242044
+ pdf | no-capture | 179030
+ pdf | terminal-bad-status | 174741
+ html | no-capture | 155323
+ pdf | null-body | 129267
+ pdf | redirect-loop | 127136
+ html | html-resource-no-capture | 117275
+ html | null-body | 100296
+ pdf | blocked-cookie | 71093
+ html | redirect-loop | 65519
+ html | terminal-bad-status | 64856
+ html | blocked-cookie | 64095
+ html | spn2-backoff | 55173
+ pdf | link-loop | 27440
+ html | wrong-mimetype | 26016
+ html | wayback-content-error | 20109
+ xml | | 13624
+ pdf | wrong-mimetype | 8411
+ xml | success | 6899
+ html | petabox-error | 6199
+ html | wayback-error | 5269
+ html | spn2-cdx-lookup-failure | 4635
+ html | spn2-recent-capture | 4527
+ xml | null-body | 2353
+ (30 rows)
+
+## Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
+ # COPY 3962331
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
+ # 3.96M 0:01:47 [36.7k/s]
+
+Top domains:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 789988 www.mdpi.com
+ 318142 www.frontiersin.org
+ 226316 link.springer.com
+ 204429 www.scielo.br
+ 201175 www.sciencedirect.com
+ 72852 ieeexplore.ieee.org
+ 68983 dx.doi.org
+ 33286 www.dovepress.com
+ 26020 elifesciences.org
+ 23838 www.cetjournal.it
+ 21102 mab-online.nl
+ 20242 www.revistas.usp.br
+ 16564 periodicos.uem.br
+ 15710 journals.openedition.org
+ 14514 dergipark.org.tr
+ 14072 apcz.umk.pl
+ 13924 ojs.minions.amsterdam
+ 13717 bmgn-lchr.nl
+ 13512 ojstest.minions.amsterdam
+ 10440 journals.asm.org
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # Done
+
+## Stats Again
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 4704006
+ html | wrong-scope | 1761227
+ html | success | 778165
+ pdf | no-pdf-link | 759805
+ html | no-capture | 382080
+ html | unknown-scope | 313391
+ html | html-resource-no-capture | 292953
+ pdf | no-capture | 290311
+ pdf | terminal-bad-status | 271776
+ pdf | null-body | 129267
+ pdf | blocked-cookie | 108491
+ html | terminal-bad-status | 103014
+ html | null-body | 100296
+ html | blocked-cookie | 88533
+ pdf | | 81517
+ pdf | skip-url-blocklist | 76443
+ html | spn2-backoff | 50615
+ pdf | link-loop | 45516
+ html | wrong-mimetype | 33525
+ html | wayback-content-error | 25535
+ pdf | empty-blob | 21431
+ pdf | redirect-loop | 19795
+ html | petabox-error | 18291
+ html | empty-blob | 14391
+ pdf | wrong-mimetype | 14084
+ html | redirect-loop | 12856
+ xml | success | 10381
+ xml | no-capture | 10008
+ html | skip-url-blocklist | 3294
+ html | cdx-error | 3275
+ (30 rows)
+
+Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k
+PDFs with no attempt at all? Maybe a filter, or bogus URLs.
+
+Over 1.5M new PDF success over this crawl iteration period, nice.
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md
new file mode 100644
index 0000000..415f23b
--- /dev/null
+++ b/notes/ingest/2022-07_targeted.md
@@ -0,0 +1,140 @@
+
+Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.
+
+ export PATCHDATE=2022-07-29
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
+ => COPY 3524573
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ => 3.11M 0:01:08 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 624948 doi.org
+ 382492 www.jstage.jst.go.jp
+ 275087 www.mdpi.com
+ 157134 www.persee.fr
+ 108979 www.sciencedirect.com
+ 94375 www.scielo.br
+ 50834 onlinelibrary.wiley.com
+ 49991 journals.lww.com
+ 30354 www.frontiersin.org
+ 27963 doaj.org
+ 27058 www.e-periodica.ch
+ 24147 dl.acm.org
+ 23389 aclanthology.org
+ 22086 www.research-collection.ethz.ch
+ 21589 medien.die-bonn.de
+ 18866 www.ingentaconnect.com
+ 18583 doi.nrct.go.th
+ 18271 repositories.lib.utexas.edu
+ 17634 hdl.handle.net
+ 16366 archives.datapages.com
+ 15146 cgscholar.com
+ 13987 dl.gi.de
+ 13188 www.degruyter.com
+ 12503 ethos.bl.uk
+ 12304 preprints.jmir.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+ => done
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+
+## Re-Ingest
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
+ => 3.52M 0:01:37 [36.2k/s]
+
+Ingest:
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md
new file mode 100644
index 0000000..ac7c68f
--- /dev/null
+++ b/notes/ingest/2022-09_oaipmh.md
@@ -0,0 +1,397 @@
+
+Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921>
+
+I updated the transform script to block some additional domains.
+
+
+## Prep
+
+Fetch the snapshot:
+
+ cd /srv/sandcrawler/tasks/
+ wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst
+
+Transform to ingest requests:
+
+ cd /srv/sandcrawler/src/python
+ git log | head -n1
+ # commit dfd4605d84712eccb95a63e50b0bcb343642b433
+
+ pipenv shell
+ zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz
+ # 16.1M 1:01:02 [4.38k/s]
+
+Curious about types, though this would probably be handled at fatcat ingest
+time:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt
+
+ head oai_type_counts.txt -n30
+ 5623867 info:eu-repo/semantics/article
+ 5334928 info:eu-repo/semantics/publishedVersion
+ 3870359 text
+ 1240225 Text
+ 829169 Article
+ 769849 NonPeerReviewed
+ 665700 PeerReviewed
+ 648740 Peer-reviewed Article
+ 547857 article
+ 482906 info:eu-repo/semantics/bachelorThesis
+ 353814 Thesis
+ 329269 Student thesis
+ 262650 info:eu-repo/semantics/conferenceObject
+ 185354 Journal articles
+ 162021 info:eu-repo/semantics/doctoralThesis
+ 152079 Journal Article
+ 150226 Research Article
+ 130217 Conference papers
+ 127255 Artículo revisado por pares
+ 124243 Newspaper
+ 123908 ##rt.metadata.pkp.peerReviewed##
+ 123309 Photograph
+ 122981 info:eu-repo/semantics/masterThesis
+ 116719 Book
+ 108946 Image
+ 108216 Report
+ 107946 Other
+ 103562 masterThesis
+ 103038 info:eu-repo/semantics/other
+ 101404 StillImage
+ [...]
+
+And formats:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt
+
+ head -n 20 oai_format_counts.txt
+ 11151928 application/pdf
+ 677413 text
+ 561656 text/html
+ 498518 image/jpeg
+ 231219 Text
+ 193638 text/xml
+ 147214 Image
+ 117073 image/jpg
+ 110872 pdf
+ 91323 image/tiff
+ 76948 bib
+ 75393 application/xml
+ 70244 Digitized from 35 mm. microfilm.
+ 68206 mods
+ 59227 PDF
+ 57677 application/epub+zip
+ 57602 application/octet-stream
+ 52072 text/plain
+ 51620 application/msword
+ 47227 audio/mpeg
+
+Also, just overall size (number of records):
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l
+ # 20,840,301
+
+Next load in to sandcrawler DB:
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request -
+
+ Traceback (most recent call last):
+ File "./persist_tool.py", line 311, in <module>
+ main()
+ File "./persist_tool.py", line 307, in main
+ args.func(args)
+ File "./persist_tool.py", line 119, in run_ingest_request
+ pusher.run()
+ File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run
+ self.worker.push_batch(batch)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values
+ cur.execute(b''.join(parts))
+ psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx"
+ DETAIL: Index row references tuple (6893121,3) in relation "ingest_request".
+ HINT: Values larger than 1/3 of a buffer page cannot be indexed.
+ Consider a function index of an MD5 hash of the value, or use full text indexing.
+ 15.7M 0:41:48 [6.27k/s]
+
+Darn, this means we won't get reasonable stats about how many rows were
+inserted/updated.
+
+Patched the persist tool to skip very long URLs, and ran again (backwards, just
+URLs which didn't get inserted already):
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \
+ | tac \
+ | head -n1000000 \
+ | pv -l \
+ | ./persist_tool.py ingest-request -
+ # 1.00M 0:03:04 [5.41k/s]
+ # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0})
+
+Status of just the new lines:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+---------
+ | 6398455
+ success | 540219
+ no-pdf-link | 41316
+ link-loop | 23871
+ no-capture | 11350
+ redirect-loop | 8315
+ wrong-mimetype | 2394
+ terminal-bad-status | 1540
+ null-body | 1038
+ cdx-error | 272
+ empty-blob | 237
+ petabox-error | 213
+ wayback-error | 186
+ blocked-cookie | 107
+ timeout | 47
+ wayback-content-error | 26
+ spn2-cdx-lookup-failure | 21
+ skip-url-blocklist | 16
+ spn2-backoff | 15
+ body-too-large | 13
+ (20 rows)
+
+
+## Bulk Ingest
+
+Should already have filtered domains/prefixes in transform script, so not
+including filters here.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json';
+ # COPY 6398455
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json
+ # 6.40M 0:02:18 [46.2k/s]
+
+ cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # DONE
+
+Expect this ingest to take a week or so.
+
+Then, run stats again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3617175
+ success | 2775036
+ no-pdf-link | 449298
+ link-loop | 74260
+ terminal-bad-status | 47819
+ wrong-mimetype | 20195
+ redirect-loop | 18197
+ empty-blob | 12127
+ cdx-error | 3038
+ skip-url-blocklist | 2630
+ wayback-error | 2599
+ petabox-error | 2354
+ wayback-content-error | 1617
+ blocked-cookie | 1293
+ null-body | 1038
+ body-too-large | 670
+ | 143
+ bad-gzip-encoding | 64
+ timeout | 47
+ spn2-cdx-lookup-failure | 20
+ (20 rows)
+
+
+## Crawl Seedlist
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'terminal-bad-status'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'timeout'
+ OR ingest_file_result.status = 'wayback-content-error'
+ )
+ ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json';
+ => COPY 3692846
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json
+ => 3.69M 0:01:19 [46.6k/s]
+
+This will be used for re-ingest later. For now, extract URLs:
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | jq .base_url -r \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ => 3.66M 0:00:59 [61.8k/s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | rg '"terminal_url"' \
+ | jq -r .result.terminal_url \
+ | rg -v ^null$ \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ => 0.00 0:00:05 [0.00 /s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | awk '{print "F+ " $1}' \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+What domains are we crawling?
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | sort -u -S 4G \
+ | cut -d/ -f3 \
+ | sort \
+ | uniq -c \
+ | sort -nr \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+
+ head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+ 91899 raco.cat
+ 70116 islandora.wrlc.org
+ 68708 urn.kb.se
+ 63726 citeseerx.ist.psu.edu
+ 50370 publications.rwth-aachen.de
+ 44885 urn.nsk.hr
+ 38429 server15795.contentdm.oclc.org
+ 33041 periodicos.ufpb.br
+ 32519 nbn-resolving.org
+ 31990 www.ajol.info
+ 24745 hal.archives-ouvertes.fr
+ 22569 id.nii.ac.jp
+ 17239 tilburguniversity.on.worldcat.org
+ 15873 dspace.nbuv.gov.ua
+ 15436 digitalcommons.wustl.edu
+ 14885 www.iiste.org
+ 14623 www.manchester.ac.uk
+ 14033 nbn-resolving.de
+ 13999 opus4.kobv.de
+ 13689 www.redalyc.org
+
+Sizes:
+
+ wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+
+Copy seedlist to crawler:
+
+ # as regular user
+ scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp
+
+## Post-Crawl Bulk Ingest
+
+ # ran 2022-11-16, after crawl cleanup
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -----------------------+---------
+ success | 4721164 +1,946,128
+ no-pdf-link | 1116290
+ no-capture | 673939
+ terminal-bad-status | 232217
+ link-loop | 148544
+ wrong-mimetype | 68841
+ redirect-loop | 26262
+ empty-blob | 17759
+ cdx-error | 6570
+ blocked-cookie | 4026
+ blocked-wall | 3054
+ skip-url-blocklist | 2924
+ body-too-large | 2404
+ bad-redirect | 1565
+ wayback-error | 1320
+ petabox-error | 1083
+ null-body | 1038
+ wayback-content-error | 264
+ bad-gzip-encoding | 150
+ | 143
+ (20 rows)
+
diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt
new file mode 100644
index 0000000..ae06272
--- /dev/null
+++ b/notes/ingest_domains.txt
@@ -0,0 +1,294 @@
+
+## Queries to find broken domains
+
+Top domains with failed ingests:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+Status overview for a particular domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT domain, terminal_status_code, COUNT((domain, terminal_status_code))
+ FROM (SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ AND t1.terminal_status_code is not null
+ GROUP BY domain, terminal_status_code
+ ORDER BY COUNT DESC;
+
+Sample recent failures:
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+## Failing
+
+www.osapublishing.org
+
+ this publisher (The Optical Society) is systemically using a CAPTCHA to
+ gate access to PDFs. bummer! could ask them to white-list?
+
+ has citation_pdf_url, so that isn't an issue
+
+ status: "no-pdf-link"
+ hops:
+ "https://doi.org/10.1364/optica.6.000798",
+ "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0"
+ "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C"
+
+ domain | status | count
+ -----------------------+---------------------+-------
+ www.osapublishing.org | no-capture | 16680
+ www.osapublishing.org | no-pdf-link | 373
+ www.osapublishing.org | redirect-loop | 19
+ www.osapublishing.org | terminal-bad-status | 5
+ www.osapublishing.org | cdx-error | 1
+ www.osapublishing.org | wrong-mimetype | 1
+ www.osapublishing.org | spn-error | 1
+ www.osapublishing.org | success | 1
+ www.osapublishing.org | wayback-error | 1
+ (9 rows)
+
+www.persee.fr
+
+ Seems to be mostly blocking or rate-limiting?
+
+ domain | status | count
+ ---------------+-------------------------------------+-------
+ www.persee.fr | no-capture | 37862
+ www.persee.fr | terminal-bad-status | 3134
+ www.persee.fr | gateway-timeout | 2828
+ www.persee.fr | no-pdf-link | 431
+ www.persee.fr | spn-error | 75
+ www.persee.fr | redirect-loop | 23
+ www.persee.fr | success | 8
+ www.persee.fr | spn2-error | 2
+ www.persee.fr | spn2-error:soft-time-limit-exceeded | 1
+ www.persee.fr | wrong-mimetype | 1
+ (10 rows)
+
+journals.openedition.org
+
+ PDF access is via "freemium" subscription. Get redirects to:
+
+ https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053
+
+ Content is technically open access (HTML and license; for all content?),
+ but can't be crawled as PDF without subscription.
+
+ domain | status | count
+ --------------------------+-------------------------+-------
+ journals.openedition.org | redirect-loop | 29587
+ journals.openedition.org | success | 6821
+ journals.openedition.org | no-pdf-link | 1507
+ journals.openedition.org | no-capture | 412
+ journals.openedition.org | wayback-error | 32
+ journals.openedition.org | wrong-mimetype | 27
+ journals.openedition.org | terminal-bad-status | 13
+ journals.openedition.org | spn2-cdx-lookup-failure | 4
+ journals.openedition.org | spn-remote-error | 1
+ journals.openedition.org | null-body | 1
+ journals.openedition.org | cdx-error | 1
+ (11 rows)
+
+journals.lww.com
+
+ no-pdf-link
+
+ domain | status | count
+ ------------------+----------------+-------
+ journals.lww.com | no-pdf-link | 11668
+ journals.lww.com | wrong-mimetype | 131
+ (2 rows)
+
+ doi prefix: 10.1097
+
+ <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" />
+ data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
+
+ Some weird thing going on, maybe they are blocking-via-redirect based on
+ our User-Agent? Seems like wget works, so funny that they don't block that.
+
+musewide.aip.de
+
+ no-pdf-link
+
+koreascience.or.kr | no-pdf-link | 8867
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+www.cairn.info | link-loop | 8717
+
+easy.dans.knaw.nl | no-pdf-link | 8262
+scielo.conicyt.cl | no-pdf-link | 7925
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'scielo.conicyt.cl'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%scielo.conicyt.cl%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+ domain | status | count
+ -------------------+---------------------+-------
+ scielo.conicyt.cl | no-pdf-link | 7926
+ scielo.conicyt.cl | success | 4972
+ scielo.conicyt.cl | terminal-bad-status | 1474
+ scielo.conicyt.cl | wrong-mimetype | 6
+ scielo.conicyt.cl | no-capture | 4
+ scielo.conicyt.cl | null-body | 1
+
+
+ pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 |
+ pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 |
+ pdf | https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 |
+ pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 |
+
+ These seem, on retry, like success? Maybe previous was a matter of warc/revisit not getting handled correctly?
+
+ pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 |
+ pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 |
+ pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 |
+
+ Look like web/xml only.
+
+ TODO: XML ingest (and replay?) support. These are as "<article>", not sure if that is JATS or what.
+
+www.kci.go.kr | no-pdf-link | 6842
+www.m-hikari.com | no-pdf-link | 6763
+cshprotocols.cshlp.org | no-pdf-link | 6553
+www.bibliotekevirtual.org | no-pdf-link | 6309
+data.hpc.imperial.ac.uk | no-pdf-link | 6071
+projecteuclid.org | link-loop | 5970
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'projecteuclid.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%projecteuclid.org%'
+ AND status = 'link-loop'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------+-------------------------+-------
+ projecteuclid.org | link-loop | 5985
+ projecteuclid.org | success | 26
+ projecteuclid.org | wayback-error | 26
+ projecteuclid.org | wrong-mimetype | 17
+ projecteuclid.org | spn2-cdx-lookup-failure | 4
+ projecteuclid.org | other-mimetype | 4
+ projecteuclid.org | no-capture | 3
+ projecteuclid.org | terminal-bad-status | 2
+ projecteuclid.org | spn2-error:job-failed | 1
+ projecteuclid.org | spn-remote-error | 1
+ (10 rows)
+
+ Doing a cookie check and redirect.
+
+ TODO: brozzler behavior to "click the link" instead?
+
+www.scielo.br | no-pdf-link | 5823
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.scielo.br'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.scielo.br%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ---------------+-------------------------+-------
+ www.scielo.br | success | 35150
+ www.scielo.br | no-pdf-link | 5839
+ www.scielo.br | terminal-bad-status | 429
+ www.scielo.br | no-capture | 189
+ www.scielo.br | wrong-mimetype | 7
+ www.scielo.br | spn2-cdx-lookup-failure | 2
+ (6 rows)
+
+ Seems to just be the subset with no PDFs.
+
+get.iedadata.org | no-pdf-link | 5822
+www.pdcnet.org | no-pdf-link | 5798
+publications.rwth-aachen.de | no-pdf-link | 5323
+www.sciencedomain.org | no-pdf-link | 5231
+medicalforum.ch | terminal-bad-status | 4574
+jrnl.nau.edu.ua | link-loop | 4145
+ojs.academypublisher.com | no-pdf-link | 4017
+
+## MAG bulk ingest
+
+- dialnet.unirioja.es | redirect-loop | 240967
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ => may be worth re-crawling via heritrix?
+- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ => and other *.onlinelibrary.wiley.com
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+- papers.ssrn.com | redirect-loop | 27328
+ => blocking is pretty aggressive, using cookies or referrer or something.
+ maybe a brozzler behavior would work, but doesn't currently
+
+## Out of Scope
+
+Datasets only?
+
+- plutof.ut.ee
+- www.gbif.org
+- doi.pangaea.de
+- www.plate-archive.org
+
+Historical non-paper content:
+
+- dhz.uni-passau.de (newspapers)
+- digital.ucd.ie (irish historical)
+
+Mostly datasets (some PDF content):
+
+- *.figshare.com
+- zenodo.com
+- data.mendeley.com
diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt
new file mode 100644
index 0000000..fcdc3e4
--- /dev/null
+++ b/notes/possible_ingest_targets.txt
@@ -0,0 +1,15 @@
+
+- all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
index 52a3264..cd8176e 100644
--- a/notes/tasks/2021-09-09_pdf_url_lists.md
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -64,3 +64,7 @@ ingest_file_result table, pdf, success: 66,487,928
"Parsed web PDFs": `file_meta`, left join CDX
(didn't do this one)
+
+---
+
+Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09>
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
index 65e9fe3..5fb69d1 100644
--- a/notes/tasks/2021-12-06_regrobid.md
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -191,6 +191,84 @@ And some earlier files of interest on `aitio`:
| pv -l \
| kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## Ancient Fatcat Files
+
+Files from an era where we didn't record GROBID version or status, even for
+success.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status_code = 200
+ AND grobid.status IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ -- sort of arbitary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json'
+ WITH NULL '';
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 107k 0:00:03 [29.9k/s]
+
+
+## Start Re-Processing Old GROBID Versions
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status = 'success'
+ AND grobid.grobid_version NOT LIKE '0.7.%'
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ -- sort of arbitary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json'
+ WITH NULL '';
+
+This one is huge, and want to process in batches/chunks of ~8 million at a time.
+
+ cd /srv/sandcrawler/tasks/
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \
+ | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json
+
+Submit individual batches like:
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Overall progress:
+
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small)
+
+This finally finished on 2022-04-26. Horray!
+
## General Counts
How many fatcat files of what mimetype (reported in sandcrawler-db)?
@@ -287,3 +365,16 @@ What are the GROBID status codes for fatcat files? Narrowed down:
error | 200 | 3
(7 rows)
+Ran the same query again on 2021-12-15:
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 45092915
+ error | 500 | 302373
+ | | 250335
+ | 200 | 53352
+ bad-grobid-xml | 200 | 39
+ error-timeout | -4 | 37
+ error | 200 | 34
+ error | 503 | 2
+ (8 rows)
diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
new file mode 100644
index 0000000..b5422c2
--- /dev/null
+++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
@@ -0,0 +1,23 @@
+
+Martin crawled more than 10 million new PDFs from various platform domains. We
+should get these processed and included in sandcrawler-db.
+
+## Select CDX Rows
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM cdx
+ LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json'
+ WITH NULL '';
+ => COPY 8801527
+
+ cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ # for pdfextract, would be: sandcrawler-prod.unextracted
diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
new file mode 100644
index 0000000..c727a57
--- /dev/null
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -0,0 +1,225 @@
+
+Want to do priority crawling of Ukranian web content, plus Russia and Belarus.
+
+
+## What is Missing?
+
+ (country_code:ua OR lang:uk)
+ => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA
+ later in day, already some 22k missing found! wow
+ => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing
+
+## Metadata Prep
+
+- container metadata update (no code changes)
+ x wikidata SPARQL update
+ x chocula run
+ x journal metadata update (fatcat)
+ x update journal stats (fatcat extra)
+- DOAJ article metadata import
+ x prep and upload single JSON file
+
+
+## Journal Homepage URL Crawl
+
+x dump ukraine-related journal homepages from chocula DB
+x create crawl config
+x start crawl
+x repeat for belarus and russia
+
+
+ python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv
+
+sqlite3:
+
+ select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publi
+ 1952
+
+ SELECT COUNT(*) FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+ => 1970
+
+ .mode csv
+ .once homepage_urls_ukraine.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+
+ .mode csv
+ .once homepage_urls_russia.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ru'
+ OR journal.lang = 'ru'
+ OR journal.name like '%russ%'
+ OR journal.publisher like '%russ%';
+
+ .mode csv
+ .once homepage_urls_belarus.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'by'
+ OR journal.lang = 'be'
+ OR journal.name like '%belarus%'
+ OR journal.publisher like '%belarus%';
+
+ cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+ 1971 homepage_urls_ukraine.tsv
+ 3482 homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv
+ 3728 homepage_urls_russia.tsv
+ 2420 homepage_urls.2022-03-08.ru_tld.tsv
+ 6030 homepage_urls_russia_combined.2022-03-08.tsv
+
+
+ cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv
+ 138 homepage_urls_belarus.tsv
+ 85 homepage_urls.2022-03-08.by_tld.tsv
+ 222 homepage_urls_belarus_combined.2022-03-08.tsv
+
+
+## Landing Page Crawl
+
+x create crawl config
+x fatcat ingest query for related URLs
+ => special request code/label?
+x finish .by and .ru article URL dump, start crawling
+x URL list filtered from new OAI-PMH feed
+ => do we need to do full bulk load/dump, or not?
+- URL list from partner (google)
+- do we need to do alternative thing of iterating over containers, ingesting each?
+
+ ./fatcat_ingest.py --env prod \
+ --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk"
+
+ # around Tue 08 Mar 2022 01:07:37 PM PST
+ # Expecting 185659 release objects in search queries
+ # didn't complete successfully? hrm
+
+ # ok, retry "manually" (with kafkacat)
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json
+ # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318})
+ # 103k 0:25:04 [68.7 /s]
+
+ zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz
+ # 103k 0:00:02 [38.1k/s]
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:by OR lang:be" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz
+ # Expecting 2266 release objects in search queries
+ # 1.29k 0:00:34 [37.5 /s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ru OR lang:ru" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz
+ # Expecting 1515246 release objects in search queries
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz
+
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt
+ # 309k 0:00:03 [81.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt
+ # 71.2k 0:00:03 [19.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
+ # 276k 0:00:03 [72.9k/s]
+
+
+### Landing Page Bulk Ingest
+
+Running these 2022-03-24, after targeted crawl completed:
+
+ zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 103k 0:00:02 [36.1k/s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 1.29k 0:00:00 [15.8k/s]
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 546k 0:00:13 [40.6k/s]
+
+It will probably take a week or more for these to complete.
+
+
+## Outreach
+
+- openalex
+- sucho.org
+- ceeol.com
diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md
new file mode 100644
index 0000000..273ff32
--- /dev/null
+++ b/notes/tasks/2022-04-27_pdf_url_lists.md
@@ -0,0 +1,72 @@
+
+Another dump of PDF URLs for partners. This time want to provide TSV with full
+wayback download URLs, as well as "access" URLs.
+
+ export TASKDATE=2022-04-27
+
+## "Ingested", AKA, "Targetted" PDF URLs
+
+These are URLs where we did a successful ingest run.
+
+ COPY (
+ SELECT
+ terminal_sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
+ ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ AND hit = true
+ ORDER BY terminal_sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 85712674
+
+May contain duplicates, both by sha1hex, URL, or both.
+
+Note that this could be filtered by timestamp, to make it monthly/annual.
+
+
+## All CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url,
+ ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ ORDER BY cdx.sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 161504070
+
+Should be unique by wayback URL; may contain near-duplicates or duplicates by
+
+## Upload to archive.org
+
+TODO: next time compress these files first (gzip/pigz)
+
+ia upload ia_scholarly_urls_$TASKDATE \
+ -m collection:ia_biblio_metadata \
+ -m title:"IA Scholarly URLs ($TASKDATE)" \
+ -m date:$TASKDATE \
+ -m creator:"Internet Archive Web Group" \
+ -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
+ /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
+
diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
new file mode 100644
index 0000000..74d3857
--- /dev/null
+++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
@@ -0,0 +1,132 @@
+
+Had a huge number of SPN requests for the andrzejklimczuk.com domain,
+presumably from the author.
+
+Many were duplicates (same file, multiple releases, often things like zenodo
+duplication). Many were also GROBID 500s, due to truncated common crawl
+captures.
+
+Needed to cleanup! Basically sorted through a few editgroups manually, then
+rejected all the rest and manually re-submitted with the below queries and
+commands:
+
+ SELECT COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
+ => 589
+
+ SELECT ingest_file_result.status, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY ingest_file_result.status;
+
+ status | count
+ ----------------+-------
+ cdx-error | 1
+ success | 587
+ wrong-mimetype | 1
+ (3 rows)
+
+
+ SELECT grobid.status_code, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY grobid.status_code;
+
+ status_code | count
+ -------------+-------
+ 200 | 385
+ 500 | 202
+ | 2
+ (3 rows)
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 500
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
+ => COPY 202
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 200
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
+ => COPY 385
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+ | jq '. + {force_recrawl: true}' -c \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+ | shuf \
+ | head -n100 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+ | shuf \
+ | head -n10000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
diff --git a/please b/please
index 298a1c5..74e9766 100755
--- a/please
+++ b/please
@@ -12,7 +12,7 @@ import subprocess
from datetime import datetime
HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler"
-HBASE_HOST = "wbgrp-svc263.us.archive.org"
+HBASE_HOST = "wbgrp-svc350.us.archive.org"
ZOOKEEPER_HOSTS = "mtrcs-zk1.us.archive.org:2181"
GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
diff --git a/sandcrawler-rfc.md b/proposals/2018_original_sandcrawler_rfc.md
index ecf7ab8..ecf7ab8 100644
--- a/sandcrawler-rfc.md
+++ b/proposals/2018_original_sandcrawler_rfc.md
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
index c05c9df..768784f 100644
--- a/proposals/2019_ingest.md
+++ b/proposals/2019_ingest.md
@@ -1,5 +1,5 @@
-status: work-in-progress
+status: deployed
This document proposes structure and systems for ingesting (crawling) paper
PDFs and other content as part of sandcrawler.
diff --git a/proposals/20200129_pdf_ingest.md b/proposals/20200129_pdf_ingest.md
index 620ed09..157607e 100644
--- a/proposals/20200129_pdf_ingest.md
+++ b/proposals/20200129_pdf_ingest.md
@@ -1,5 +1,5 @@
-status: planned
+status: deployed
2020q1 Fulltext PDF Ingest Plan
===================================
diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md
index 31a2db6..6f6443f 100644
--- a/proposals/20200207_pdftrio.md
+++ b/proposals/20200207_pdftrio.md
@@ -1,5 +1,8 @@
-status: in progress
+status: deployed
+
+NOTE: while this has been used in production, as of December 2022 the results
+are not used much in practice, and we don't score every PDF that comes along
PDF Trio (ML Classification)
==============================
diff --git a/proposals/20201012_no_capture.md b/proposals/20201012_no_capture.md
index 27c14d1..7f6a1f5 100644
--- a/proposals/20201012_no_capture.md
+++ b/proposals/20201012_no_capture.md
@@ -1,5 +1,8 @@
-status: in-progress
+status: work-in-progress
+
+NOTE: as of December 2022, bnewbold can't remember if this was fully
+implemented or not.
Storing no-capture missing URLs in `terminal_url`
=================================================
diff --git a/proposals/20201103_xml_ingest.md b/proposals/20201103_xml_ingest.md
index 25ec973..34e00b0 100644
--- a/proposals/20201103_xml_ingest.md
+++ b/proposals/20201103_xml_ingest.md
@@ -1,22 +1,5 @@
-status: wip
-
-TODO:
-x XML fulltext URL extractor (based on HTML biblio metadata, not PDF url extractor)
-x differential JATS XML and scielo XML from generic XML?
- application/xml+jats is what fatcat is doing for abstracts
- but it should be application/jats+xml?
- application/tei+xml
- if startswith "<article " and "<article-meta>" => JATS
-x refactor ingest worker to be more general
-x have ingest code publish body to kafka topic
-x write a persist worker
-/ create/configure kafka topic
-- test everything locally
-- fatcat: ingest tool to create requests
-- fatcat: entity updates worker creates XML ingest requests for specific sources
-- fatcat: ingest file import worker allows XML results
-- ansible: deployment of persist worker
+status: deployed
XML Fulltext Ingest
====================
diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md
index f231a7f..141ece8 100644
--- a/proposals/2020_pdf_meta_thumbnails.md
+++ b/proposals/2020_pdf_meta_thumbnails.md
@@ -1,5 +1,5 @@
-status: work-in-progress
+status: deployed
New PDF derivatives: thumbnails, metadata, raw text
===================================================
diff --git a/proposals/2021-04-22_crossref_db.md b/proposals/2021-04-22_crossref_db.md
index bead7a4..1d4c3f8 100644
--- a/proposals/2021-04-22_crossref_db.md
+++ b/proposals/2021-04-22_crossref_db.md
@@ -1,5 +1,5 @@
-status: work-in-progress
+status: deployed
Crossref DOI Metadata in Sandcrawler DB
=======================================
diff --git a/proposals/2021-12-09_trawling.md b/proposals/2021-12-09_trawling.md
new file mode 100644
index 0000000..33b6b4c
--- /dev/null
+++ b/proposals/2021-12-09_trawling.md
@@ -0,0 +1,180 @@
+
+status: work-in-progress
+
+NOTE: as of December 2022, the implementation on these features haven't been
+merged to the main branch. Development stalled in December 2021.
+
+Trawling for Unstructured Scholarly Web Content
+===============================================
+
+## Background and Motivation
+
+A long-term goal for sandcrawler has been the ability to pick through
+unstructured web archive content (or even non-web collection), identify
+potential in-scope research outputs, extract metadata for those outputs, and
+merge the content in to a catalog (fatcat).
+
+This process requires integration of many existing tools (HTML and PDF
+extraction; fuzzy bibliographic metadata matching; machine learning to identify
+in-scope content; etc), as well as high-level curration, targetting, and
+evaluation by human operators. The goal is to augment and improve the
+productivity of human operators as much as possible.
+
+This process will be similar to "ingest", which is where we start with a
+specific URL and have some additional context about the expected result (eg,
+content type, exernal identifier). Some differences with trawling are that we
+are start with a collection or context (instead of single URL); have little or
+no context about the content we are looking for; and may even be creating a new
+catalog entry, as opposed to matching to a known existing entry.
+
+
+## Architecture
+
+The core operation is to take a resource and run a flowchart of processing
+steps on it, resulting in an overall status and possible related metadata. The
+common case is that the resource is a PDF or HTML coming from wayback (with
+contextual metadata about the capture), but we should be flexible to supporting
+more content types in the future, and should try to support plain files with no
+context as well.
+
+Some relatively simple wrapper code handles fetching resources and summarizing
+status/counts.
+
+Outside of the scope of sandcrawler, new fatcat code (importer or similar) will
+be needed to handle trawl results. It will probably make sense to pre-filter
+(with `jq` or `rg`) before passing results to fatcat.
+
+At this stage, trawl workers will probably be run manually. Some successful
+outputs (like GROBID, HTML metadata) would be written to existing kafka topics
+to be persisted, but there would not be any specific `trawl` SQL tables or
+automation.
+
+It will probably be helpful to have some kind of wrapper script that can run
+sandcrawler trawl processes, then filter and pipe the output into fatcat
+importer, all from a single invocation, while reporting results.
+
+TODO:
+- for HTML imports, do we fetch the full webcapture stuff and return that?
+
+
+## Methods of Operation
+
+### `cdx_file`
+
+An existing CDX file is provided on-disk locally.
+
+### `cdx_api`
+
+Simplified variants: `cdx_domain`, `cdx_surt`
+
+Uses CDX API to download records matching the configured filters, then processes the file.
+
+Saves the CDX file intermediate result somewhere locally (working or tmp
+directory), with timestamp in the path, to make re-trying with `cdx_file` fast
+and easy.
+
+
+### `archiveorg_web_collection`
+
+Uses `cdx_collection.py` (or similar) to fetch a full CDX list, by iterating over
+then process it.
+
+Saves the CDX file intermediate result somewhere locally (working or tmp
+directory), with timestamp in the path, to make re-trying with `cdx_file` fast
+and easy.
+
+### Others
+
+- `archiveorg_file_collection`: fetch file list via archive.org metadata, then processes each
+
+## Schema
+
+Per-resource results:
+
+ hit (bool)
+ indicates whether resource seems in scope and was processed successfully
+ (roughly, status 'success', and
+ status (str)
+ success: fetched resource, ran processing, pa
+ skip-cdx: filtered before even fetching resource
+ skip-resource: filtered after fetching resource
+ wayback-error (etc): problem fetching
+ content_scope (str)
+ filtered-{filtertype}
+ article (etc)
+ landing-page
+ resource_type (str)
+ pdf, html
+ file_meta{}
+ cdx{}
+ revisit_cdx{}
+
+ # below are resource_type specific
+ grobid
+ pdf_meta
+ pdf_trio
+ html_biblio
+ (other heuristics and ML)
+
+High-level request:
+
+ trawl_method: str
+ cdx_file_path
+ default_filters: bool
+ resource_filters[]
+ scope: str
+ surt_prefix, domain, host, mimetype, size, datetime, resource_type, http_status
+ value: any
+ values[]: any
+ min: any
+ max: any
+ biblio_context{}: set of expected/default values
+ container_id
+ release_type
+ release_stage
+ url_rel
+
+High-level summary / results:
+
+ status
+ request{}: the entire request object
+ counts
+ total_resources
+ status{}
+ content_scope{}
+ resource_type{}
+
+## Example Corpuses
+
+All PDFs (`application/pdf`) in web.archive.org from before the year 2000.
+Starting point would be a CDX list.
+
+Spidering crawls starting from a set of OA journal homepage URLs.
+
+Archive-It partner collections from research universities, particularly of
+their own .edu domains. Starting point would be an archive.org collection, from
+which WARC files or CDX lists can be accessed.
+
+General archive.org PDF collections, such as
+[ERIC](https://archive.org/details/ericarchive) or
+[Document Cloud](https://archive.org/details/documentcloud).
+
+Specific Journal or Publisher URL patterns. Starting point could be a domain,
+hostname, SURT prefix, and/or URL regex.
+
+Heuristic patterns over full web.archive.org CDX index. For example, .edu
+domains with user directories and a `.pdf` in the file path ("tilde" username
+pattern).
+
+Random samples of entire Wayback corpus. For example, random samples filtered
+by date, content type, TLD, etc. This would be true "trawling" over the entire
+corpus.
+
+
+## Other Ideas
+
+Could have a web archive spidering mode: starting from a seed, fetch multiple
+captures (different captures), then extract outlinks from those, up to some
+number of hops. An example application would be links to research group
+webpages or author homepages, and to try to extract PDF links from CVs, etc.
+
diff --git a/proposals/brainstorm/2021-debug_web_interface.md b/proposals/brainstorm/2021-debug_web_interface.md
new file mode 100644
index 0000000..442b439
--- /dev/null
+++ b/proposals/brainstorm/2021-debug_web_interface.md
@@ -0,0 +1,9 @@
+
+status: brainstorm idea
+
+Simple internal-only web interface to help debug ingest issues.
+
+- paste a hash, URL, or identifier and get a display of "everything we know" about it
+- enter a URL/SURT prefix and get aggregate stats (?)
+- enter a domain/host/prefix and get recent attempts/results
+- pre-computed periodic reports on ingest pipeline (?)
diff --git a/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md b/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md
new file mode 100644
index 0000000..b3ad447
--- /dev/null
+++ b/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md
@@ -0,0 +1,36 @@
+
+status: brainstorming
+
+We continue to see issues with heritrix3-based crawling. Would like to have an
+option to switch to higher-throughput heritrix-based crawling.
+
+SPNv2 path would stick around at least for save-paper-now style ingest.
+
+
+## Sketch
+
+Ingest requests are created continuously by fatcat, with daily spikes.
+
+Ingest workers run mostly in "bulk" mode, aka they don't make SPNv2 calls.
+`no-capture` responses are recorded in sandcrawler SQL database.
+
+Periodically (daily?), a script queries for new no-capture results, filtered to
+the most recent period. These are processed in a bit in to a URL list, then
+converted to a heritrix frontier, and sent to crawlers. This could either be an
+h3 instance (?), or simple `scp` to a running crawl directory.
+
+The crawler crawls, with usual landing page config, and draintasker runs.
+
+TODO: can we have draintasker/heritrix set a maximum WARC life? Like 6 hours?
+or, target a smaller draintasker item size, so they get updated more frequently
+
+Another SQL script dumps ingest requests from the *previous* period, and
+re-submits them for bulk-style ingest (by workers).
+
+The end result would be things getting crawled and updated within a couple
+days.
+
+
+## Sketch 2
+
+Upload URL list to petabox item, wait for heritrix derive to run (!)
diff --git a/python/.flake8 b/python/.flake8
index 9c9aabe..c7ef5fe 100644
--- a/python/.flake8
+++ b/python/.flake8
@@ -3,11 +3,12 @@ select = C,E,F,W,ANN
# ANN003 is annotation on, eg, **kwargs
# ANN101 is annotation on 'self' (why would that be wanted?)
# ANN204 is annotation on '__init__()'
+# ANN401 is 'Any' type
# E265,E266 are restrictions on comments ('#')
# E501 is line-too-long, which we enforce with black
# W503,E203 are allowed by black
# TODO: C901 is complexity, should be re-enabled at some point
-ignore = ANN003,ANN101,ANN204,E265,E266,E501,C901,W503,E203
+ignore = ANN003,ANN101,ANN204,ANN401,E265,E266,E501,C901,W503,E203
per-file-ignores =
sandcrawler/__init__.py: F401
sandcrawler/ia.py: E402
diff --git a/python/Pipfile b/python/Pipfile
index 36faed0..b841755 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -1,6 +1,6 @@
[[source]]
name = "ia"
-url = "https://devpi.archive.org/wb/prod"
+url = "https://devpi.us.archive.org/wb/prod"
verify_ssl = true
[[source]]
@@ -26,14 +26,10 @@ types-beautifulsoup4 = "*"
types-dateparser = "*"
types-psycopg2 = "*"
types-Pillow = "*"
-
-# must lock black to an exact version because it is still "beta"
-# see: https://github.com/psf/black/issues/517
-black = "==21.9b0"
+black = "*"
[packages]
requests = ">=2"
-raven = {extras = ['flask'],version = "*"}
confluent-kafka = "*"
python-snappy = "*"
boto3 = "*"
@@ -43,12 +39,14 @@ bs4 = "*"
python-magic = "*"
ftfy = "*"
internetarchive = "*"
-Flask = ">=1"
urlcanon = "*"
Pillow = ">=3"
python-poppler = ">=0.2.1"
selectolax = ">=0.2"
-trafilatura = ">=1"
+# constraining trafilatura to prevent a version conflict with
+# `charset_normalizer`, between htmldate and requests
+trafilatura = ">=1,<1.4"
+htmldate= ">=1,<1.4"
pydantic = ">=1.7"
dateparser = "*"
braveblock = "*"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index a79afcc..546a420 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "a4fff1d3758dc92d8f607c9ff489fbe42b0b4e6a379d2d91688a768a187f0b2b"
+ "sha256": "35d0f0cd2f3903cce19d5a73f50a89ba09a1b43abbda84894fd45411d7f32760"
},
"pipfile-spec": 6,
"requires": {
@@ -10,7 +10,7 @@
"sources": [
{
"name": "ia",
- "url": "https://devpi.archive.org/wb/prod",
+ "url": "https://devpi.us.archive.org/wb/prod",
"verify_ssl": true
},
{
@@ -21,6 +21,14 @@
]
},
"default": {
+ "async-timeout": {
+ "hashes": [
+ "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15",
+ "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.0.2"
+ },
"backports.zoneinfo": {
"hashes": [
"sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
@@ -40,57 +48,56 @@
"sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
"sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
],
- "markers": "python_version >= '3.6' and python_version < '3.9'",
+ "markers": "python_version < '3.9' and python_version >= '3.6' and python_version < '3.9'",
"version": "==0.2.1"
},
"beautifulsoup4": {
"hashes": [
- "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
- "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"
+ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
+ "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
],
- "version": "==4.10.0"
- },
- "blinker": {
- "hashes": [
- "sha256:471aee25f3992bd325afa3772f1063dbdbbca947a041b8b89466dc00d606f8b6"
- ],
- "version": "==1.4"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.11.1"
},
"boto3": {
"hashes": [
- "sha256:76b3ee0d1dd860c9218bc864cd29f1ee986f6e1e75e8669725dd3c411039379e",
- "sha256:c39cb6ed376ba1d4689ac8f6759a2b2d8a0b0424dbec0cd3af1558079bcf06e8"
+ "sha256:7a6766c7177a9c6f85365e02aabd96ca4d72e08bc5cb127cb51b0a97ac9b9d1b",
+ "sha256:82b790b1dabd0746b028d2013b5d4d636a41f3aaf25520081f4c173cb6eb395d"
],
"index": "ia",
- "version": "==1.20.23"
+ "version": "==1.26.37"
},
"botocore": {
"hashes": [
- "sha256:640b62110aa6d1c25553eceafb5bcd89aedeb84b191598d1f6492ad24374d285",
- "sha256:7459766c4594f3b8877e8013f93f0dc6c6486acbeb7d9c9ae488396529cc2e84"
+ "sha256:18ab8e95345a6d0d2653ce65d261a0aef6fef8a57a35a89e3cea6ffe315e92fc",
+ "sha256:3afa4fec9f7713caa05116563b38f81bec7bd20585d517155484d3f25efab5aa"
],
- "version": "==1.23.23"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.29.37"
},
"braveblock": {
"hashes": [
- "sha256:08b671719a188e34a74b1a896637097e1f9baf5f9c248a18696b497ceb4e7a5d",
- "sha256:27391b6b7a5de45bcaa550c44fa66294ff84d9ca71ced1ed08eb1866262a11de",
- "sha256:451d3b4ccdecbf793a9f364abbd54e0ee83bb1a977e95ef5aa63281d1c9062d2",
- "sha256:5713a754d1a9f90e7ed33683dd81cec699c9e445441a694ca5dfb87c8862a113",
- "sha256:5ac3ddeee982719183cbcaa1ea8b09fa13af7d59874c0454ac5a152ca15da751",
- "sha256:758fde5e67983ef621133fa7661045e730fe15380420cddcd2fd94eb1bea27ae",
- "sha256:8b11036fcda47b65db2f398bdb46da51fec7349e8717b10e7ebf0a1ef048ee94",
- "sha256:aa508a376bd76cf63bf3202dc5911e7ffafe865d06adf77ebcfb557c7724d9d6",
- "sha256:adf69ed16707bbc626b18a439e71b56ff9f1d5b5cf93da3df2c37990244f9875",
- "sha256:be317cc407c7a26873545cd4614266cc7ff3addbeae0375bdfbd1e1461061ab5",
- "sha256:cfc9bb01de107dd762464daebd3d28114a09134f2885ad788f95b382893df40c",
- "sha256:e4a00528eefb94d7f7c4602ca196b98d894a33b5c775c6e8c82257af78c481d9"
+ "sha256:0bfca14473275366f2f822751c4e8dde7f94ee5ce8a9372244870452458f4fe1",
+ "sha256:107050b2e1c885b748727573a54a85d2e1ea9ad86146370f6eb79ca18b9673d4",
+ "sha256:13f9769eac9c4027eba2f400e635572796f7a7feb343f442d13c4b78e7d6f536",
+ "sha256:14efeada36418525da7c3b26393041b85242ffa1165328ec7eaf9b9780b72d62",
+ "sha256:1ab6980d10b8a02fd0dc73e28f18a0a3e17be636d314c1fdaa3bbb3e36a81f0f",
+ "sha256:45286418a43a3dfab50bdaf922f5003dbd2c3d1f696d23883568f4fa14b8093e",
+ "sha256:66c2442154102bff8df9c6f05cb72cd5cda6f4e1ed88592800ab1b6e8100e806",
+ "sha256:73de4f925ae5442d3361a71d7c0eeb1b4c540bf3d0c91100a00325ccef9e743c",
+ "sha256:80cbeeb6d083bc2a9106214188e5ce05362f248c1051344dc6673b7b38a561da",
+ "sha256:8460b10c9b82cc9d0b6056e1fe206bea209fe5a83ba87bdf9486305657224a44",
+ "sha256:903c506fc05eb6b76e4d31f957c1118078582db80f8ef5ce5ac74418f094d498",
+ "sha256:dcb773e3e275de896efebe57159a67587283d6ca1d1a36695170a3756fd2ef3a"
],
"index": "ia",
- "version": "==0.1.12"
+ "version": "==0.3.0"
},
"brotli": {
"hashes": [
+ "sha256:02177603aaca36e1fd21b091cb742bb3b305a569e2402f1ca38af471777fb019",
+ "sha256:11d3283d89af7033236fa4e73ec2cbe743d4f6a81d41bd234f24bf63dde979df",
+ "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
"sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
@@ -100,43 +107,75 @@
"sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
"sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
"sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
+ "sha256:3148362937217b7072cf80a2dcc007f09bb5ecb96dae4617316638194113d5be",
+ "sha256:330e3f10cd01da535c70d09c4283ba2df5fb78e915bea0a28becad6e2ac010be",
+ "sha256:336b40348269f9b91268378de5ff44dc6fbaa2268194f85177b53463d313842a",
+ "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:3b8b09a16a1950b9ef495a0f8b9d0a87599a9d1f179e2d4ac014b2ec831f87e7",
+ "sha256:3c1306004d49b84bd0c4f90457c6f57ad109f5cc6067a9664e12b7b79a9948ad",
+ "sha256:3ffaadcaeafe9d30a7e4e1e97ad727e4f5610b9fa2f7551998471e3736738679",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
"sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
+ "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
+ "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5bf37a08493232fbb0f8229f1824b366c2fc1d02d64e7e918af40acd15f3e337",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
"sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
+ "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
+ "sha256:73fd30d4ce0ea48010564ccee1a26bfe39323fde05cb34b5863455629db61dc7",
"sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
+ "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
"sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
"sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
+ "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
"sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
"sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
+ "sha256:8ed6a5b3d23ecc00ea02e1ed8e0ff9a08f4fc87a1f58a2530e71c0f48adf882f",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
"sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
"sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
"sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
+ "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
"sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b1375b5d17d6145c798661b67e4ae9d5496920d9265e2f00f1c2c0b5ae91fbde",
+ "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
+ "sha256:b3523f51818e8f16599613edddb1ff924eeb4b53ab7e7197f85cbc321cdca32f",
+ "sha256:b43775532a5904bc938f9c15b77c613cb6ad6fb30990f3b0afaea82797a402d8",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
+ "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
+ "sha256:ba72d37e2a924717990f4d7482e8ac88e2ef43fb95491eb6e0d124d77d2a150d",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:c8e521a0ce7cf690ca84b8cc2272ddaf9d8a50294fd086da67e517439614c755",
+ "sha256:cab1b5964b39607a66adbba01f1c12df2e55ac36c81ec6ed44f2fca44178bf1a",
+ "sha256:cb02ed34557afde2d2da68194d12f5719ee96cfb2eacc886352cb73e3808fc5d",
+ "sha256:cc0283a406774f465fb45ec7efb66857c09ffefbe49ec20b7882eff6d3c86d3a",
"sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
+ "sha256:e1abbeef02962596548382e393f56e4c94acd286bd0c5afba756cffc33670e8a",
+ "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
+ "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
+ "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
+ "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
+ "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
],
"version": "==1.0.9"
@@ -150,142 +189,141 @@
},
"certifi": {
"hashes": [
- "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
- "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2021.10.8"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
"chardet": {
"hashes": [
- "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
- "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
+ "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5",
+ "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"
],
- "markers": "python_version >= '3.6'",
- "version": "==4.0.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.1.0"
},
"charset-normalizer": {
"hashes": [
- "sha256:1eecaa09422db5be9e29d7fc65664e6c33bd06f9ced7838578ba40d58bdf3721",
- "sha256:b0b883e8e874edfdece9c28f314e3dd5badf067342e42fb162203335ae61aa2c"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "markers": "python_version >= '3'",
- "version": "==2.0.9"
- },
- "click": {
- "hashes": [
- "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3",
- "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"
- ],
- "version": "==8.0.3"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
"configparser": {
"hashes": [
- "sha256:1b35798fdf1713f1c3139016cfcbc461f09edbf099d1fb658d4b7479fcaa3daa",
- "sha256:e8b39238fb6f0153a069aa253d349467c3c4737934f253ef6abac5fe0eca1e5d"
+ "sha256:8be267824b541c09b08db124917f48ab525a6c3e837011f3130781a224c57090",
+ "sha256:b065779fd93c6bf4cee42202fa4351b4bb842e96a3fb469440e484517a49b9fa"
],
- "version": "==5.2.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.3.0"
},
"confluent-kafka": {
"hashes": [
- "sha256:11173733e0540a98e493c91a05686ba4e777883c2cda756d47848fce84e06b30",
- "sha256:1246f3c674357630b078bbc76824eabea87ac5a9ca270886abca9c7f052381da",
- "sha256:1e8e7770eaf2f6df0a3620f0bfc5dc2293e6ca3ac1e14c4babe6fefc03f50e18",
- "sha256:2b4d8d53148a26f0cafcb42e9483f76473120bc091fa0ede497caf8cc8db6f88",
- "sha256:33c32de2357ddcd3f8a98a96591c69c7ada76215e051ed5dbb17b763921f376a",
- "sha256:415c23e7ccf948e50de616191febd4ec299b1d748ae0abdab3888f0ec0915ea9",
- "sha256:5d9c75822c0b1cb7787fc60a78b3f249bfd56b3a692dd079d9d7510ffefe2c99",
- "sha256:5e044e5c5fce78c87aedd56dbd7bd5c046dbf7a0bc9a0eff32229766be8808a5",
- "sha256:80e01b4791513c27eded8517af847530dfdf04c43d99ff132ed9c3085933b75b",
- "sha256:8bb0d7e28deac58b234f7481184a60f743838c4e06309fbcca9484b93697c33b",
- "sha256:955de681f2bc7241d580ebb43d7516f825950518bfaf2c8e6bc3c88d22be4f08",
- "sha256:9f5ff838f2ca87e467aa992f9fcb8bbdd222097690fe6b15aa733025a1613532",
- "sha256:aa5f2905783b1a4e560e4172e228e2174a077090cbdf91a5448dd8deac02b2a9",
- "sha256:aad712996e1465e806f7e027ad248b2474d2140a3985d5f7789a5ff68e5dba8a",
- "sha256:bbba1f144992fbd920cb10c7c2450e82fc8936e04272d36be3a3567bfbf768d4",
- "sha256:bc2ad89e6cc4e05c5855dfbee2838a699861943ab3ea62ff2b914d72fcd1a6c6",
- "sha256:c0b3fc70c31f636562464e905c2b75a2705d3d53bb4687fd48b574dee2a7fa51",
- "sha256:ee3f33077e3534b33cec9825843cd705ede458c585cfab2a052813391fb73291",
- "sha256:f2628f3ebffe05d346f0456c566d5519a59bd0aa88179a9b7408c1808415c102",
- "sha256:f98fa8982da1a960e6c1bfca49b235f8de45c8af83d6b741d78f96f346748488"
+ "sha256:24872e3e427b16f77461ae7e6cf48f9c5c03c06884ac51bad179580a4dd29145",
+ "sha256:2fb97bd25d436bd59fe079885aa77a3a2f23cface9c6359d4700053665849262",
+ "sha256:3207c76d1510571cbda85560c293dec5f8d6645103b3f471abab5c83e51a7ccd",
+ "sha256:344a7fec57d3348002392a7bd5cf66fb9dbe4a103e81636037cccd6fff944e28",
+ "sha256:382739e499deaf488459c2307ebcc0e9b3653340801d6053c207c84ad710ee8d",
+ "sha256:4d6bfcc352cd608fcf325037b4425c0edaeae0c6a5439423a865110b59f897e9",
+ "sha256:4f27ddf7daf630a95e1d7dfddd0c8cf8a7755c9567dc9851bf2e74c22f34af42",
+ "sha256:5b24587b30a4d288a7b1c5cc756ee707fc1293fa28454f8db40267ed9d7e73c8",
+ "sha256:6ab745babc33a864e3ca3a2659c005ed52503e39936fff5812eeb21920009c8b",
+ "sha256:7e6592533b3f8cfbc086ea2d472058f10e5f6a04a388edca01773285c63284b4",
+ "sha256:b9ad6ad9d58c2735129f94f044b2236d7de87d77a101c8c630363add12d62a4a",
+ "sha256:be7b37020f614017d8a047565b3fb61ceef9c30a9ee093f9373d06a4c32068ae",
+ "sha256:bef263b6d78a3e63399e1b82cc07cbb30af762884df96a369cba0e1011405344",
+ "sha256:c4b7c4d0b647952d2b506948131d6e7e1c42ccb16aac8e3e52369c16b94e7215",
+ "sha256:d036bf5e1d7cb3743125d7caf62b1a23b12e403de240144b6117ddbb8f815a33",
+ "sha256:d0cbf8e7510497afd651e134bccb9d579aa90234e45734046fcb6b752d2ee312",
+ "sha256:d533ea0e527122f177943ee35eb356b8d9f7af35fe357e0cdc0514d95804aaea",
+ "sha256:e41b9313c44f54a3cd29b0e95fa32a8e685edaa9287b338f59530b21ebc0b453",
+ "sha256:e9107767cc9240cbf9b5c0fdded5eeead86a1690d1c15de6cbbdcc9d7e3b1962",
+ "sha256:f96033c335da26ea1716ab9adfce459c211b023ca09528f958fb28bf099fc0df",
+ "sha256:f970a2c6d22c934ea68d645abcc96056ecb107489f28a38b2171f65655b7e41f",
+ "sha256:fe31b3b6930d67380df371f5088950f93da5fac580cde3bedb35f992b2498e1b",
+ "sha256:ff08b9f978f8b37f2961614a68f9fdb4fabd10cdd940234e80200806d93a1c30",
+ "sha256:ff4d1557b7fb72e752c36205a344863b8f4f23b3a834780fc36eb7ebde614de7"
],
"index": "ia",
- "version": "==1.7.0"
+ "version": "==1.9.2"
},
"contextlib2": {
"hashes": [
"sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f",
"sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869"
],
+ "markers": "python_version >= '3.6'",
"version": "==21.6.0"
},
"courlan": {
"hashes": [
- "sha256:2922aea7635d6a177d42ac93a3087d254c81fdc3b56178164bd933c8e3f061ab",
- "sha256:30dd02243951688275768b1025bccfd88396669210d7e435fa89960fe6c62faf"
+ "sha256:d06c5b048b2b5cd5c0ac77304dc24b795e4bb257a7b6077ea405a3b5e99ae179",
+ "sha256:d141d30f8e52d344cf9904aa29e4d8750e934026bdbca2dc7bd58b750566d058"
],
- "version": "==0.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
},
"crawllib": {
"hashes": [
- "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
+ "sha256:9a30a10318dc706f1e27ff0af950ac14a77f73c18d329771f44d872fd63630e3"
],
- "version": "==0.1.4.8"
- },
- "cssselect": {
- "hashes": [
- "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
- "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
- ],
- "version": "==1.1.0"
+ "version": "==0.1.6"
},
"cython": {
"hashes": [
- "sha256:07d5b8ce032110822dad2eb09950a98b9e255d14c2daf094be32d663790b3365",
- "sha256:08a502fe08756070276d841c830cfc37254a2383d0a5bea736ffb78eff613c88",
- "sha256:0cf7c3033349d10c5eb33ded1a78974f680e95c245a585c18a2046c67f8ed461",
- "sha256:0e9e28eb6bb19f5e25f4bf5c8f8ea7db3bc4910309fab2305e5c9c5a5223db77",
- "sha256:1825d6f2160188dfe1faa0099d30ed0e5ae56826627bf0de6dcb8dcbcf64c9bd",
- "sha256:191978e5839ca425eb78f0f60a84ad5db7a07b97e8076f9853d0d12c3ccec5d4",
- "sha256:1c2f262f7d032ec0106534982609ae0148f86ba52fc747df64e645706af20926",
- "sha256:3379e67113e92fef490a88eca685b07b711bb4db1ddce66af9e460673a5335cc",
- "sha256:3497e366ffed67454162d31bf4bd2ac3aa183dfac089eb4124966c9f98bd9c05",
- "sha256:3913f6a50409ab36a5b8edbb4c3e4d441027f43150d8335e5118d34ef04c745c",
- "sha256:3e94eb973f99c1963973a46dbd9e3974a03b8fe0af3de02dc5d65b4c6a6f9b3f",
- "sha256:44cc749f288423182504a8fc8734070a369bf576734b9f0fafff40cd6b6e1b3e",
- "sha256:4dc3d230849d61844e6b5737ee624c896f51e98c8a5d13f965b02a7e735230be",
- "sha256:4ee99fab5191f403f33774fc92123291c002947338c2628b1ed42ed0017149dd",
- "sha256:4f7b135cba0d2509890e1dcff2005585bc3d51c9f17564b70d8bc82dc7ec3a5e",
- "sha256:5d0d97a5f661dccf2f9e14cf27fe9027f772d089fb92fdd3dd8a584d9b8a2916",
- "sha256:64394ec94d9a0e5002f77e67ee8ceed97f25b483b18ea6aab547f4d82ca32ef6",
- "sha256:6759b73a9a1013cbdac71ebefa284aa50617b5b32957a54eedaa22ac2f6d48de",
- "sha256:6efb798993260532879f683dc8ce9e30fd1ec86f02c926f1238a8e6a64576321",
- "sha256:79d2f84a6d87d45ef580c0441b5394c4f29344e05126a8e2fb4ba4144425f3b0",
- "sha256:7b3f6e4cfcc103bccdcbc666f613d669ac378c8918629296cdf8191c0c2ec418",
- "sha256:800cbe944886320e4a4b623becb97960ae9d7d80f2d12980b83bcfb63ff47d5b",
- "sha256:8726456c7e376410b3c631427da0a4affe1e481424436d1e3f1888cc3c0f8d2e",
- "sha256:a206a1f8ea11314e02dc01bf24f397b8f1b413bbcc0e031396caa1a126b060c2",
- "sha256:a87cbe3756e7c464acf3e9420d8741e62d3b2eace0846cb39f664ad378aab284",
- "sha256:aa9e1fe5ee0a4f9d2430c1e0665f40b48f4b511150ca02f69e9bb49dc48d4e0e",
- "sha256:b5b3e876e617fe2cf466d02198b76924dcda3cc162a1043226a9c181b9a662a6",
- "sha256:b6f397256cfab2d0f0af42659fca3232c23f5a570b6c21ed66aaac22dd95da15",
- "sha256:b8fc9c78262b140364ce1b28ac40ff505a47ac3fd4f86311d461df04a28b3f23",
- "sha256:c204cb2d005a426c5c83309fd7edea335ff5c514ffa6dc72ddac92cfde170b69",
- "sha256:d288f25e8abb43b1cfa2fe3d69b2d6236cca3ff6163d090e26c4b1e8ea80dfbf",
- "sha256:decd641167e97a3c1f973bf0bbb560d251809f6db8168c10edf94c0a1e5dec65",
- "sha256:e6fa0a7cec9461c5ca687f3c4bb59cf2565afb76c60303b2dc8b280c6e112810",
- "sha256:e96857ab2dbd8a67852341001f1f2a1ef3f1939d82aea1337497a8f76a9d7f6c",
- "sha256:eb64ec369eba2207fbe618650d78d9af0455e0c1abb301ec024fa9f3e17a15cc",
- "sha256:f95433e6963164de372fc1ef01574d7419d96ce45274f296299267d874b90800"
- ],
- "version": "==0.29.25"
+ "sha256:061e25151c38f2361bc790d3bcf7f9d9828a0b6a4d5afa56fbed3bd33fb2373a",
+ "sha256:06be83490c906b6429b4389e13487a26254ccaad2eef6f3d4ee21d8d3a4aaa2b",
+ "sha256:07d173d3289415bb496e72cb0ddd609961be08fe2968c39094d5712ffb78672b",
+ "sha256:0bbc27abdf6aebfa1bce34cd92bd403070356f28b0ecb3198ff8a182791d58b9",
+ "sha256:0ea8267fc373a2c5064ad77d8ff7bf0ea8b88f7407098ff51829381f8ec1d5d9",
+ "sha256:3875c2b2ea752816a4d7ae59d45bb546e7c4c79093c83e3ba7f4d9051dd02928",
+ "sha256:39afb4679b8c6bf7ccb15b24025568f4f9b4d7f9bf3cbd981021f542acecd75b",
+ "sha256:3f85eb2343d20d91a4ea9cf14e5748092b376a64b7e07fc224e85b2753e9070b",
+ "sha256:40eff7aa26e91cf108fd740ffd4daf49f39b2fdffadabc7292b4b7dc5df879f0",
+ "sha256:479690d2892ca56d34812fe6ab8f58e4b2e0129140f3d94518f15993c40553da",
+ "sha256:4a4b03ab483271f69221c3210f7cde0dcc456749ecf8243b95bc7a701e5677e0",
+ "sha256:513e9707407608ac0d306c8b09d55a28be23ea4152cbd356ceaec0f32ef08d65",
+ "sha256:5514f3b4122cb22317122a48e175a7194e18e1803ca555c4c959d7dfe68eaf98",
+ "sha256:5ba622326f2862f9c1f99ca8d47ade49871241920a352c917e16861e25b0e5c3",
+ "sha256:63b79d9e1f7c4d1f498ab1322156a0d7dc1b6004bf981a8abda3f66800e140cd",
+ "sha256:656dc5ff1d269de4d11ee8542f2ffd15ab466c447c1f10e5b8aba6f561967276",
+ "sha256:67fdd2f652f8d4840042e2d2d91e15636ba2bcdcd92e7e5ffbc68e6ef633a754",
+ "sha256:79e3bab19cf1b021b613567c22eb18b76c0c547b9bc3903881a07bfd9e7e64cf",
+ "sha256:856d2fec682b3f31583719cb6925c6cdbb9aa30f03122bcc45c65c8b6f515754",
+ "sha256:8669cadeb26d9a58a5e6b8ce34d2c8986cc3b5c0bfa77eda6ceb471596cb2ec3",
+ "sha256:8733cf4758b79304f2a4e39ebfac5e92341bce47bcceb26c1254398b2f8c1af7",
+ "sha256:97335b2cd4acebf30d14e2855d882de83ad838491a09be2011745579ac975833",
+ "sha256:afbce249133a830f121b917f8c9404a44f2950e0e4f5d1e68f043da4c2e9f457",
+ "sha256:b0595aee62809ba353cebc5c7978e0e443760c3e882e2c7672c73ffe46383673",
+ "sha256:b6da3063c5c476f5311fd76854abae6c315f1513ef7d7904deed2e774623bbb9",
+ "sha256:c8e8025f496b5acb6ba95da2fb3e9dacffc97d9a92711aacfdd42f9c5927e094",
+ "sha256:cddc47ec746a08603037731f5d10aebf770ced08666100bd2cdcaf06a85d4d1b",
+ "sha256:cdf10af3e2e3279dc09fdc5f95deaa624850a53913f30350ceee824dc14fc1a6",
+ "sha256:d968ffc403d92addf20b68924d95428d523436adfd25cf505d427ed7ba3bee8b",
+ "sha256:dbee03b8d42dca924e6aa057b836a064c769ddfd2a4c2919e65da2c8a362d528",
+ "sha256:e1958e0227a4a6a2c06fd6e35b7469de50adf174102454db397cec6e1403cce3",
+ "sha256:e6ffa08aa1c111a1ebcbd1cf4afaaec120bc0bbdec3f2545f8bb7d3e8e77a1cd",
+ "sha256:e83228e0994497900af954adcac27f64c9a57cd70a9ec768ab0cb2c01fd15cf1",
+ "sha256:ea1dcc07bfb37367b639415333cfbfe4a93c3be340edf1db10964bc27d42ed64",
+ "sha256:eca3065a1279456e81c615211d025ea11bfe4e19f0c5650b859868ca04b3fcbd",
+ "sha256:ed087eeb88a8cf96c60fb76c5c3b5fb87188adee5e179f89ec9ad9a43c0c54b3",
+ "sha256:eeb475eb6f0ccf6c039035eb4f0f928eb53ead88777e0a760eccb140ad90930b",
+ "sha256:eefd2b9a5f38ded8d859fe96cc28d7d06e098dc3f677e7adbafda4dcdd4a461c",
+ "sha256:f3fd44cc362eee8ae569025f070d56208908916794b6ab21e139cea56470a2b3",
+ "sha256:f9944013588a3543fca795fffb0a070a31a243aa4f2d212f118aa95e69485831"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==0.29.32"
},
"dateparser": {
"hashes": [
- "sha256:faa2b97f51f3b5ff1ba2f17be90de2b733fb6191f89b4058787473e8202f3044",
- "sha256:fec344db1f73d005182e214c0ff27313c748bbe0c1638ce9d48a809ddfdab2a0"
+ "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c",
+ "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e"
],
"index": "ia",
- "version": "==1.1.0"
+ "version": "==1.1.4"
},
"dawg": {
"hashes": [
@@ -304,17 +342,11 @@
},
"decorator": {
"hashes": [
- "sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374",
- "sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
],
- "version": "==5.1.0"
- },
- "deprecated": {
- "hashes": [
- "sha256:43ac5335da90c31c24ba028af536a91d41d53f9e6901ddb021bcc572ce44e38d",
- "sha256:64756e3e14c8c5eea9795d93c524551432a0be75629f8f29e67ab8caf076c76d"
- ],
- "version": "==1.2.13"
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
},
"docopt": {
"hashes": [
@@ -330,47 +362,42 @@
},
"dynaconf": {
"hashes": [
- "sha256:e9d80b46ba4d9372f2f40c812594c963f74178140c0b596e57f2881001fc4d35",
- "sha256:f52fe5db7622da56a552275e8f64e4df46e3b4ae11158831b042e8ba2f6d1c96"
+ "sha256:87e0b3b12b5db9e8fb465e1f8c7fdb926cd2ec5b6d88aa7f821f316df93fb165",
+ "sha256:d9cfb50fd4a71a543fd23845d4f585b620b6ff6d9d3cc1825c614f7b2097cb39"
],
"index": "ia",
- "version": "==3.1.7"
+ "version": "==3.1.11"
},
"elasticsearch": {
"hashes": [
- "sha256:9a5a2fd53a4fce28f15f358ab13fbcfb06f47fb2c7400ea89c10d6fd3f236ecd",
- "sha256:d7f8665715ad80e3e99e42388bcc49c1b06162f72acfa1f8febe2baf5570b0ed"
+ "sha256:840adeb45a5ec9102a83f3cf481aae83a3775b75d6dd83a7310b04e44a5d0308",
+ "sha256:f511ea92e96db09b0e96b0de5fbbb7aa5c3740b0c571a364a2c3a1cc7ec06203"
],
- "version": "==7.16.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'",
+ "version": "==7.17.8"
},
"filelock": {
"hashes": [
- "sha256:2e139a228bcf56dd8b2274a65174d005c4a6b68540ee0bdbb92c76f43f29f7e8",
- "sha256:93d512b32a23baf4cac44ffd72ccf70732aeff7b8050fcaf6d3ec406d954baf4"
- ],
- "version": "==3.4.0"
- },
- "flask": {
- "hashes": [
- "sha256:7b2fb8e934ddd50731893bdcdb00fc8c0315916f9fcd50d22c7cc1a95ab634e2",
- "sha256:cb90f62f1d8e4dc4621f52106613488b5ba826b2e1e10a33eac92f723093ab6a"
+ "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2",
+ "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c"
],
- "index": "ia",
- "version": "==2.0.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.8.2"
},
"ftfy": {
"hashes": [
- "sha256:ba71121a9c8d7790d3e833c6c1021143f3e5c4118293ec3afb5d43ed9ca8e72b"
+ "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca",
+ "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"
],
"index": "ia",
- "version": "==6.0.3"
+ "version": "==6.1.1"
},
"globalwayback": {
"hashes": [
- "sha256:dbe993105f8e1b3b8346f43163d2a5524336b08207898bbd2ec9ea24fd997627"
+ "sha256:683f19dee720ef11335952aa33615e50c945196c82e18a5d8150635f92022d23"
],
"index": "ia",
- "version": "==0.8.2.1"
+ "version": "==0.8.12.6"
},
"grobid-tei-xml": {
"hashes": [
@@ -382,10 +409,11 @@
},
"htmldate": {
"hashes": [
- "sha256:075456bc736508f0343bbb6645e9f9c5dc97a30e4692e5c6b1e1c5a897454007",
- "sha256:3767aed93c72a9ac0a68ff907eefc373c183593507a8354e899229bac621da1b"
+ "sha256:603b86eaf0f076efcd653d57fe0470305f751417711f4e373279235d0ff587e6",
+ "sha256:83830715faf0f22272d9e24e571a4955308a008107d0ca9359c0de77b99766cd"
],
- "version": "==1.0.0"
+ "index": "ia",
+ "version": "==1.3.2"
},
"ialib": {
"hashes": [
@@ -398,50 +426,46 @@
"sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
"sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
],
- "markers": "python_version >= '3'",
"version": "==2.6"
},
"internetarchive": {
"hashes": [
- "sha256:ebd11ecd038c71e75a3aef8d87750b46480169ecaefb23074c4ae48440bf2836"
+ "sha256:de856465c2ef6852184d08bfd59c0ca01904865b373a27b383034ac6b4128eb6"
],
"index": "ia",
- "version": "==2.2.0"
- },
- "itsdangerous": {
- "hashes": [
- "sha256:5174094b9637652bdb841a3029700391451bd092ba3db90600dea710ba28e97c",
- "sha256:9e724d68fc22902a1435351f84c3fb8623f303fffcc566a4cb952df8c572cff0"
- ],
- "version": "==2.0.1"
+ "version": "==3.0.2"
},
"jinja2": {
"hashes": [
- "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
- "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
+ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
+ "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
- "version": "==3.0.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.1.2"
},
"jmespath": {
"hashes": [
- "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
- "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
+ "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",
+ "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"
],
- "version": "==0.10.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.0.1"
},
"jsonpatch": {
"hashes": [
"sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397",
"sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.32"
},
"jsonpointer": {
"hashes": [
- "sha256:26d9a47a72d4dc3e3ae72c4c6cd432afd73c680164cd2540772eab53cb3823b6",
- "sha256:f09f8deecaaa5aea65b5eb4f67ca4e54e1a61f7a11c75085e360fe6feb6a48bf"
+ "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9",
+ "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"
],
- "version": "==2.2"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==2.3"
},
"justext": {
"hashes": [
@@ -460,143 +484,132 @@
},
"lxml": {
"hashes": [
- "sha256:08eb9200d88b376a8ed5e50f1dc1d1a45b49305169674002a3b5929943390591",
- "sha256:0b12c95542f04d10cba46b3ff28ea52ea56995b78cf918f0b11b05e75812bb79",
- "sha256:0c15e1cd55055956e77b0732270f1c6005850696bc3ef3e03d01e78af84eaa42",
- "sha256:15d0381feb56f08f78c5cc4fc385ddfe0bde1456e37f54a9322833371aec4060",
- "sha256:197b7cb7a753cf553a45115739afd8458464a28913da00f5c525063f94cd3f48",
- "sha256:20d7c8d90d449c6a353b15ee0459abae8395dbe59ad01e406ccbf30cd81c6f98",
- "sha256:240db6f3228d26e3c6f4fad914b9ddaaf8707254e8b3efd564dc680c8ec3c264",
- "sha256:2901625f4a878a055d275beedc20ba9cb359cefc4386a967222fee29eb236038",
- "sha256:2b06a91cf7b8acea7793006e4ae50646cef0fe35ce5acd4f5cb1c77eb228e4a1",
- "sha256:2eb90f6ec3c236ef2f1bb38aee7c0d23e77d423d395af6326e7cca637519a4cb",
- "sha256:351482da8dd028834028537f08724b1de22d40dcf3bb723b469446564f409074",
- "sha256:35752ee40f7bbf6adc9ff4e1f4b84794a3593736dcce80db32e3c2aa85e294ac",
- "sha256:38b9de0de3aa689fe9fb9877ae1be1e83b8cf9621f7e62049d0436b9ecf4ad64",
- "sha256:433df8c7dde0f9e41cbf4f36b0829d50a378116ef5e962ba3881f2f5f025c7be",
- "sha256:4341d135f5660db10184963d9c3418c3e28d7f868aaf8b11a323ebf85813f7f4",
- "sha256:45fdb2899c755138722797161547a40b3e2a06feda620cc41195ee7e97806d81",
- "sha256:4717123f7c11c81e0da69989e5a64079c3f402b0efeb4c6241db6c369d657bd8",
- "sha256:47e955112ce64241fdb357acf0216081f9f3255b3ac9c502ca4b3323ec1ca558",
- "sha256:48eaac2991b3036175b42ee8d3c23f4cca13f2be8426bf29401a690ab58c88f4",
- "sha256:4aa349c5567651f34d4eaae7de6ed5b523f6d70a288f9c6fbac22d13a0784e04",
- "sha256:4ba74afe5ee5cb5e28d83b513a6e8f0875fda1dc1a9aea42cc0065f029160d2a",
- "sha256:4ec9a80dd5704ecfde54319b6964368daf02848c8954d3bacb9b64d1c7659159",
- "sha256:50790313df028aa05cf22be9a8da033b86c42fa32523e4fd944827b482b17bf0",
- "sha256:51a0e5d243687596f46e24e464121d4b232ad772e2d1785b2a2c0eb413c285d4",
- "sha256:523f195948a1ba4f9f5b7294d83c6cd876547dc741820750a7e5e893a24bbe38",
- "sha256:543b239b191bb3b6d9bef5f09f1fb2be5b7eb09ab4d386aa655e4d53fbe9ff47",
- "sha256:5ff5bb2a198ea67403bb6818705e9a4f90e0313f2215428ec51001ce56d939fb",
- "sha256:601f0ab75538b280aaf1e720eb9d68d4fa104ac274e1e9e6971df488f4dcdb0f",
- "sha256:6020c70ff695106bf80651953a23e37718ef1fee9abd060dcad8e32ab2dc13f3",
- "sha256:619c6d2b552bba00491e96c0518aad94002651c108a0f7364ff2d7798812c00e",
- "sha256:6298f5b42a26581206ef63fffa97c754245d329414108707c525512a5197f2ba",
- "sha256:662523cd2a0246740225c7e32531f2e766544122e58bee70e700a024cfc0cf81",
- "sha256:6764998345552b1dfc9326a932d2bad6367c6b37a176bb73ada6b9486bf602f7",
- "sha256:6d422b3c729737d8a39279a25fa156c983a56458f8b2f97661ee6fb22b80b1d6",
- "sha256:72e730d33fe2e302fd07285f14624fca5e5e2fb2bb4fb2c3941e318c41c443d1",
- "sha256:75d3c5bbc0ddbad03bb68b9be638599f67e4b98ed3dcd0fec9f6f39e41ee96cb",
- "sha256:7ae7089d81fc502df4b217ad77f03c54039fe90dac0acbe70448d7e53bfbc57e",
- "sha256:80d10d53d3184837445ff8562021bdd37f57c4cadacbf9d8726cc16220a00d54",
- "sha256:877666418598f6cb289546c77ff87590cfd212f903b522b0afa0b9fb73b3ccfb",
- "sha256:9b87727561c1150c0cc91c5d9d389448b37a7d15f0ba939ed3d1acb2f11bf6c5",
- "sha256:9c91a73971a922c13070fd8fa5a114c858251791ba2122a941e6aa781c713e44",
- "sha256:9db24803fa71e3305fe4a7812782b708da21a0b774b130dd1860cf40a6d7a3ee",
- "sha256:a75c1ad05eedb1a3ff2a34a52a4f0836cfaa892e12796ba39a7732c82701eff4",
- "sha256:a77a3470ba37e11872c75ca95baf9b3312133a3d5a5dc720803b23098c653976",
- "sha256:ab6db93a2b6b66cbf62b4e4a7135f476e708e8c5c990d186584142c77d7f975a",
- "sha256:afd60230ad9d8bcba005945ec3a343722f09e0b7f8ae804246e5d2cfc6bd71a6",
- "sha256:b0ca0ada9d3bc18bd6f611bd001a28abdd49ab9698bd6d717f7f5394c8e94628",
- "sha256:b567178a74a2261345890eac66fbf394692a6e002709d329f28a673ca6042473",
- "sha256:b667c51682fe9b9788c69465956baa8b6999531876ccedcafc895c74ad716cd8",
- "sha256:bbf2dc330bd44bfc0254ab37677ec60f7c7ecea55ad8ba1b8b2ea7bf20c265f5",
- "sha256:bdc224f216ead849e902151112efef6e96c41ee1322e15d4e5f7c8a826929aee",
- "sha256:cf201bf5594d1aab139fe53e3fca457e4f8204a5bbd65d48ab3b82a16f517868",
- "sha256:d43bd68714049c84e297c005456a15ecdec818f7b5aa5868c8b0a865cfb78a44",
- "sha256:daf9bd1fee31f1c7a5928b3e1059e09a8d683ea58fb3ffc773b6c88cb8d1399c",
- "sha256:e678a643177c0e5ec947b645fa7bc84260dfb9b6bf8fb1fdd83008dfc2ca5928",
- "sha256:e91d24623e747eeb2d8121f4a94c6a7ad27dc48e747e2dc95bfe88632bd028a2",
- "sha256:e95da348d57eb448d226a44b868ff2ca5786fbcbe417ac99ff62d0a7d724b9c7",
- "sha256:ee9e4b07b0eba4b6a521509e9e1877476729c1243246b6959de697ebea739643",
- "sha256:f5dd358536b8a964bf6bd48de038754c1609e72e5f17f5d21efe2dda17594dbf",
- "sha256:ffd65cfa33fed01735c82aca640fde4cc63f0414775cba11e06f84fae2085a6e"
- ],
- "markers": "python_version >= '3.5'",
- "version": "==4.6.4"
+ "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7",
+ "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726",
+ "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03",
+ "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140",
+ "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a",
+ "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05",
+ "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03",
+ "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419",
+ "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4",
+ "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e",
+ "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67",
+ "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50",
+ "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894",
+ "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf",
+ "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947",
+ "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1",
+ "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd",
+ "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3",
+ "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92",
+ "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3",
+ "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457",
+ "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74",
+ "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf",
+ "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1",
+ "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4",
+ "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975",
+ "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5",
+ "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe",
+ "sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7",
+ "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1",
+ "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2",
+ "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409",
+ "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f",
+ "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f",
+ "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5",
+ "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24",
+ "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e",
+ "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4",
+ "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a",
+ "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c",
+ "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de",
+ "sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f",
+ "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b",
+ "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5",
+ "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7",
+ "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a",
+ "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c",
+ "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9",
+ "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e",
+ "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab",
+ "sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941",
+ "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5",
+ "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45",
+ "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7",
+ "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892",
+ "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746",
+ "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c",
+ "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53",
+ "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe",
+ "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184",
+ "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38",
+ "sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df",
+ "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9",
+ "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b",
+ "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2",
+ "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0",
+ "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda",
+ "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b",
+ "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5",
+ "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380",
+ "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33",
+ "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8",
+ "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1",
+ "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889",
+ "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9",
+ "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f",
+ "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==4.9.2"
},
"markupsafe": {
"hashes": [
- "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
- "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
- "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
- "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
- "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
- "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
- "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
- "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
- "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
- "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
- "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
- "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
- "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
- "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
- "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
- "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
- "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
- "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
- "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
- "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
- "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
- "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
- "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
- "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
- "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
- "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
- "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
- "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
- "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
- "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
- "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
- "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
- "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
- "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
- "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
- "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
- "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
- "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
- "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
- "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
- "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
- "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
- "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
- "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
- "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
- "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
- "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
- "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
- "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
- "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
- "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
- "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
- "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
- "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
- "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
- "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
- "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
- "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
- "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
- "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
- "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
- "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
- "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
- "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
- "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
- "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
- "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
- "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
- "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
- ],
- "version": "==2.0.1"
+ "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
+ "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
+ "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
+ "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
+ "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
+ "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
+ "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
+ "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
+ "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
+ "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
+ "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
+ "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
+ "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
+ "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
+ "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
+ "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
+ "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
+ "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
+ "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
+ "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
+ "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
+ "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
+ "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
+ "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
+ "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
+ "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
+ "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
+ "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
+ "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
+ "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
+ "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
+ "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
+ "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
+ "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
+ "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
+ "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
+ "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
+ "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
+ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
+ "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.1.1"
},
"minio": {
"hashes": [
@@ -607,6 +620,12 @@
"index": "ia",
"version": "==6.0.2"
},
+ "perfstat": {
+ "hashes": [
+ "sha256:4f91fab9be6076972c66fe818eed488be28f1044009237adccce42ff2c7861f5"
+ ],
+ "version": "==0.1.0.1"
+ },
"pillow": {
"hashes": [
"sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8",
@@ -664,20 +683,22 @@
},
"psycopg2": {
"hashes": [
- "sha256:26322c3f114de1f60c1b0febf8fdd595c221b4f624524178f515d07350a71bd1",
- "sha256:6796ac614412ce374587147150e56d03b7845c9e031b88aacdcadc880e81bb38",
- "sha256:77b9105ef37bc005b8ffbcb1ed6d8685bb0e8ce84773738aa56421a007ec5a7a",
- "sha256:77d09a79f9739b97099d2952bbbf18eaa4eaf825362387acbb9552ec1b3fa228",
- "sha256:91c7fd0fe9e6c118e8ff5b665bc3445781d3615fa78e131d0b4f8c85e8ca9ec8",
- "sha256:a761b60da0ecaf6a9866985bcde26327883ac3cdb90535ab68b8d784f02b05ef",
- "sha256:a84da9fa891848e0270e8e04dcca073bc9046441eeb47069f5c0e36783debbea",
- "sha256:b8816c6410fa08d2a022e4e38d128bae97c1855e176a00493d6ec62ccd606d57",
- "sha256:dfc32db6ce9ecc35a131320888b547199f79822b028934bb5b332f4169393e15",
- "sha256:f65cba7924363e0d2f416041b48ff69d559548f2cb168ff972c54e09e1e64db8",
- "sha256:fd7ddab7d6afee4e21c03c648c8b667b197104713e57ec404d5b74097af21e31"
+ "sha256:093e3894d2d3c592ab0945d9eba9d139c139664dcf83a1c440b8a7aa9bb21955",
+ "sha256:190d51e8c1b25a47484e52a79638a8182451d6f6dff99f26ad9bd81e5359a0fa",
+ "sha256:1a5c7d7d577e0eabfcf15eb87d1e19314c8c4f0e722a301f98e0e3a65e238b4e",
+ "sha256:1e5a38aa85bd660c53947bd28aeaafb6a97d70423606f1ccb044a03a1203fe4a",
+ "sha256:322fd5fca0b1113677089d4ebd5222c964b1760e361f151cbb2706c4912112c5",
+ "sha256:4cb9936316d88bfab614666eb9e32995e794ed0f8f6b3b718666c22819c1d7ee",
+ "sha256:920bf418000dd17669d2904472efeab2b20546efd0548139618f8fa305d1d7ad",
+ "sha256:922cc5f0b98a5f2b1ff481f5551b95cd04580fd6f0c72d9b22e6c0145a4840e0",
+ "sha256:a5246d2e683a972e2187a8714b5c2cf8156c064629f9a9b1a873c1730d9e245a",
+ "sha256:b9ac1b0d8ecc49e05e4e182694f418d27f3aedcfca854ebd6c05bb1cffa10d6d",
+ "sha256:d3ef67e630b0de0779c42912fe2cbae3805ebaba30cda27fea2a3de650a9414f",
+ "sha256:f5b6320dbc3cf6cfb9f25308286f9f7ab464e65cfb105b64cc9c52831748ced2",
+ "sha256:fc04dd5189b90d825509caa510f20d1d504761e78b8dfb95a0ede180f71d50e5"
],
"index": "ia",
- "version": "==2.9.2"
+ "version": "==2.9.5"
},
"publicsuffix": {
"hashes": [
@@ -687,175 +708,170 @@
},
"pydantic": {
"hashes": [
- "sha256:021ea0e4133e8c824775a0cfe098677acf6fa5a3cbf9206a376eed3fc09302cd",
- "sha256:05ddfd37c1720c392f4e0d43c484217b7521558302e7069ce8d318438d297739",
- "sha256:05ef5246a7ffd2ce12a619cbb29f3307b7c4509307b1b49f456657b43529dc6f",
- "sha256:10e5622224245941efc193ad1d159887872776df7a8fd592ed746aa25d071840",
- "sha256:18b5ea242dd3e62dbf89b2b0ec9ba6c7b5abaf6af85b95a97b00279f65845a23",
- "sha256:234a6c19f1c14e25e362cb05c68afb7f183eb931dd3cd4605eafff055ebbf287",
- "sha256:244ad78eeb388a43b0c927e74d3af78008e944074b7d0f4f696ddd5b2af43c62",
- "sha256:26464e57ccaafe72b7ad156fdaa4e9b9ef051f69e175dbbb463283000c05ab7b",
- "sha256:41b542c0b3c42dc17da70554bc6f38cbc30d7066d2c2815a94499b5684582ecb",
- "sha256:4a03cbbe743e9c7247ceae6f0d8898f7a64bb65800a45cbdc52d65e370570820",
- "sha256:4be75bebf676a5f0f87937c6ddb061fa39cbea067240d98e298508c1bda6f3f3",
- "sha256:54cd5121383f4a461ff7644c7ca20c0419d58052db70d8791eacbbe31528916b",
- "sha256:589eb6cd6361e8ac341db97602eb7f354551482368a37f4fd086c0733548308e",
- "sha256:8621559dcf5afacf0069ed194278f35c255dc1a1385c28b32dd6c110fd6531b3",
- "sha256:8b223557f9510cf0bfd8b01316bf6dd281cf41826607eada99662f5e4963f316",
- "sha256:99a9fc39470010c45c161a1dc584997f1feb13f689ecf645f59bb4ba623e586b",
- "sha256:a7c6002203fe2c5a1b5cbb141bb85060cbff88c2d78eccbc72d97eb7022c43e4",
- "sha256:a83db7205f60c6a86f2c44a61791d993dff4b73135df1973ecd9eed5ea0bda20",
- "sha256:ac8eed4ca3bd3aadc58a13c2aa93cd8a884bcf21cb019f8cfecaae3b6ce3746e",
- "sha256:e710876437bc07bd414ff453ac8ec63d219e7690128d925c6e82889d674bb505",
- "sha256:ea5cb40a3b23b3265f6325727ddfc45141b08ed665458be8c6285e7b85bd73a1",
- "sha256:fec866a0b59f372b7e776f2d7308511784dace622e0992a0b59ea3ccee0ae833"
+ "sha256:05e00dbebbe810b33c7a7362f231893183bcc4251f3f2ff991c31d5c08240c42",
+ "sha256:06094d18dd5e6f2bbf93efa54991c3240964bb663b87729ac340eb5014310624",
+ "sha256:0b959f4d8211fc964772b595ebb25f7652da3f22322c007b6fed26846a40685e",
+ "sha256:19b3b9ccf97af2b7519c42032441a891a5e05c68368f40865a90eb88833c2559",
+ "sha256:1b6ee725bd6e83ec78b1aa32c5b1fa67a3a65badddde3976bca5fe4568f27709",
+ "sha256:1ee433e274268a4b0c8fde7ad9d58ecba12b069a033ecc4645bb6303c062d2e9",
+ "sha256:216f3bcbf19c726b1cc22b099dd409aa371f55c08800bcea4c44c8f74b73478d",
+ "sha256:2d0567e60eb01bccda3a4df01df677adf6b437958d35c12a3ac3e0f078b0ee52",
+ "sha256:2e05aed07fa02231dbf03d0adb1be1d79cabb09025dd45aa094aa8b4e7b9dcda",
+ "sha256:352aedb1d71b8b0736c6d56ad2bd34c6982720644b0624462059ab29bd6e5912",
+ "sha256:355639d9afc76bcb9b0c3000ddcd08472ae75318a6eb67a15866b87e2efa168c",
+ "sha256:37c90345ec7dd2f1bcef82ce49b6235b40f282b94d3eec47e801baf864d15525",
+ "sha256:4b8795290deaae348c4eba0cebb196e1c6b98bdbe7f50b2d0d9a4a99716342fe",
+ "sha256:5760e164b807a48a8f25f8aa1a6d857e6ce62e7ec83ea5d5c5a802eac81bad41",
+ "sha256:6eb843dcc411b6a2237a694f5e1d649fc66c6064d02b204a7e9d194dff81eb4b",
+ "sha256:7b5ba54d026c2bd2cb769d3468885f23f43710f651688e91f5fb1edcf0ee9283",
+ "sha256:7c2abc4393dea97a4ccbb4ec7d8658d4e22c4765b7b9b9445588f16c71ad9965",
+ "sha256:81a7b66c3f499108b448f3f004801fcd7d7165fb4200acb03f1c2402da73ce4c",
+ "sha256:91b8e218852ef6007c2b98cd861601c6a09f1aa32bbbb74fab5b1c33d4a1e410",
+ "sha256:9300fcbebf85f6339a02c6994b2eb3ff1b9c8c14f502058b5bf349d42447dcf5",
+ "sha256:9cabf4a7f05a776e7793e72793cd92cc865ea0e83a819f9ae4ecccb1b8aa6116",
+ "sha256:a1f5a63a6dfe19d719b1b6e6106561869d2efaca6167f84f5ab9347887d78b98",
+ "sha256:a4c805731c33a8db4b6ace45ce440c4ef5336e712508b4d9e1aafa617dc9907f",
+ "sha256:ae544c47bec47a86bc7d350f965d8b15540e27e5aa4f55170ac6a75e5f73b644",
+ "sha256:b97890e56a694486f772d36efd2ba31612739bc6f3caeee50e9e7e3ebd2fdd13",
+ "sha256:bb6ad4489af1bac6955d38ebcb95079a836af31e4c4f74aba1ca05bb9f6027bd",
+ "sha256:bedf309630209e78582ffacda64a21f96f3ed2e51fbf3962d4d488e503420254",
+ "sha256:c1ba1afb396148bbc70e9eaa8c06c1716fdddabaf86e7027c5988bae2a829ab6",
+ "sha256:c33602f93bfb67779f9c507e4d69451664524389546bacfe1bee13cae6dc7488",
+ "sha256:c4aac8e7103bf598373208f6299fa9a5cfd1fc571f2d40bf1dd1955a63d6eeb5",
+ "sha256:c6f981882aea41e021f72779ce2a4e87267458cc4d39ea990729e21ef18f0f8c",
+ "sha256:cc78cc83110d2f275ec1970e7a831f4e371ee92405332ebfe9860a715f8336e1",
+ "sha256:d49f3db871575e0426b12e2f32fdb25e579dea16486a26e5a0474af87cb1ab0a",
+ "sha256:dd3f9a40c16daf323cf913593083698caee97df2804aa36c4b3175d5ac1b92a2",
+ "sha256:e0bedafe4bc165ad0a56ac0bd7695df25c50f76961da29c050712596cf092d6d",
+ "sha256:e9069e1b01525a96e6ff49e25876d90d5a563bc31c658289a8772ae186552236"
],
"index": "ia",
- "version": "==1.8.2"
+ "version": "==1.10.2"
},
"pylru": {
"hashes": [
- "sha256:492f934bb98dc6c8b2370c02c95c65516ddc08c8f64d27f70087eb038621d297"
+ "sha256:47ad140a63ab9389648dadfbb4330700e0ffeeb28ec04664ee47d37ed133b0f4",
+ "sha256:b7c75b0676e2fbae647823bc209e23998772867d3679f1583c7350a9b02a59f0"
],
- "version": "==1.2.0"
+ "version": "==1.2.1"
},
"pymupdf": {
"hashes": [
- "sha256:00b2606e2ecc486fa97b1f91e86ac21133193a915f07aa74993b2f06b3fe9e2e",
- "sha256:06a85007504c91261c6c4d1e876ed88ba052b1efe7a712870880106e7cbabe90",
- "sha256:09fe247a26ee15b2ff5a422adf6d9b00f0e7781799c97e62ec684f2100e35b7a",
- "sha256:0dceea82ebe8738ce3a3895b11e7b89a4df7275763d75b9da4e2df2ebc8a716c",
- "sha256:1c0b85602220d659d50035232f46c48ab080c4b51043d4baab3b0b402e454279",
- "sha256:37627ed2cc5abbbb527279423b0a41486e8b32729c09c2d66c8ab32ee1710407",
- "sha256:3907f59221da3e1547c379bb3b5551df489433175d036494a15cb5791c20a677",
- "sha256:481227be40ff7e67f262e8d71e37608a419d0168a16db875ebbf3266fb5e768f",
- "sha256:6f630ff21223f90f40b4a2e3f3bbfad0349b7710309c2fe9e9a5524ffd9ef414",
- "sha256:7de28525e8d67f6767677a7a4db0c3bf3aa4c34ed99c0dfff36d8e9f6dbcde4b",
- "sha256:964bbacddab9cba6cd2c8f388429fe4a97c0775b3096a13ac15724f5a1a2c58d",
- "sha256:98278c948e51a8be804e07bfe57668a8d3d7620d7dfefec2ee5c660cacbe68ac",
- "sha256:a8a5cb859ec64fd732e525a861bfc76e12c145cac839b9501be3f8529cf1abf8",
- "sha256:a8f5afd83dc622351f5c1693a25e2378baaafd80b02c6f909938b75fcd41963c",
- "sha256:b324e91c92f39f1e34c2984e0c11407ac6f7299a5fc83b3239644dca00f949f4",
- "sha256:c4e044ff278e36bd4bce1748bd7270d2c768b26fbee3701d0eabb017824c1b90",
- "sha256:da0268dc8a094cfd06692c82a9bf670b0809e255241158bdf220b51fc54453a5",
- "sha256:db1a45fe2d11fd3eb321c3f7b729e9442cbdfc8a5dd65f1002c9aa166cb3a30c",
- "sha256:e6aaa8fb46c8238998ab2d0ce56c556b28ccef5de6a779cf0e6ebd5f8409e223",
- "sha256:f19ced62493b627bf2ff3b968949b0b223f8d535301bd504f9508bddc448244d",
- "sha256:f56f3ccec0b09c2426de19fef60c59b1754934ab686614056956a478e9f4f59e"
+ "sha256:05c54acf69ee55ef97453f9c52982ef2839c188fe464d6b4cdc053bd4c6298f1",
+ "sha256:11b913664c059146e512e8559ebd9f976570ef21c0338c953836bc02051c1d7e",
+ "sha256:13ed689e5ad4c3adecb7586050de8baaa1819f48e2c57ca4e87f80e3b2727cb3",
+ "sha256:164dc67f1f5db3b22207b2aeba0fadff0503123c8f31c46768b7da7d3595a181",
+ "sha256:1e7b85e2611a9cca7a410e4c5a510a11131de7c5da9379e46615a8d3adfa6df5",
+ "sha256:38188f88a6e648b9f3a87d29de5b4ed52f910827a15859b183f1321c68e6ac00",
+ "sha256:39192c009afd8dd877a79ed02519ec8d17699bec9e9543115e490f06a553e200",
+ "sha256:4c5e7211b85e13050ac6e25879d4f0476b7a04f23bd3b6442489cec9f8da8418",
+ "sha256:7281324a0325dd30c033644cc8654167dcbfe47c4b1d49805d407fa5a64ce76b",
+ "sha256:909fb46900e7422515291761a1294902cf163226ec8918ea4c3454537336dfeb",
+ "sha256:945529b7868f9fe290b11dfbc37e2b9012610fac9763686ccf91a4d968305c5e",
+ "sha256:976fb0e93f025617890f8f8d8517371684131aa0e9fc0c1d0b4cd8bd564cce27",
+ "sha256:9998f7dfa0f99d6c2c3eb0dcfbfd44433247c23c4b781bc45f76dab421bc554b",
+ "sha256:a3b8e5c2de6192c89f379283aa07aa7fd044098dab43a8cd3ac172e961caf286",
+ "sha256:b0db8c81b6c781e373ed005f7595e49b760f91edb3b36d1dc69ec29b4fad34f8",
+ "sha256:c03004415a6d140b2c4bb494bb507c9ccbd55d713407e3b5bc1dd35fa45f2be0",
+ "sha256:cfd6c666b02a066e9e76d9ce8ca5e7fa4f2bf7a8ce6934cd2837b08509d46f8e",
+ "sha256:dffe67c5574d0ebb1e39b5ecf806fb4fd4ddb01bee5630f516ece4468252c9f0",
+ "sha256:ef3d13e27f1585d776f6a2597f113aabd28d36b648b983a72850b21c5399ab08",
+ "sha256:f04086036d40af50e5d6f54e949fa12eacda2d752562a2f85215763b137bf864",
+ "sha256:f3f96bd465e9e0e2960bb70e92233af0865181b9dd8ac5bc6b159d79584df2fe"
],
"index": "ia",
- "version": "==1.19.2"
+ "version": "==1.19.6"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==2.8.2"
},
"python-magic": {
"hashes": [
- "sha256:4fec8ee805fea30c07afccd1592c0f17977089895bdfaae5fec870a84e997626",
- "sha256:de800df9fb50f8ec5974761054a708af6e4246b03b4bdaee993f948947b0ebcf"
+ "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b",
+ "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"
],
"index": "ia",
- "version": "==0.4.24"
+ "version": "==0.4.27"
},
"python-poppler": {
"hashes": [
- "sha256:6843398adc9c290035646c4cf3c7bfcea9c8e04390bb9cd8fdc9bd063fb77880"
+ "sha256:8b6a157e51cbb4c08353a21ca3f6f396558759cdfb0b80071379ad89d5f7c533"
],
"index": "ia",
- "version": "==0.2.2"
+ "version": "==0.3.0"
},
"python-snappy": {
"hashes": [
- "sha256:0094411996b5c4e621fc44f8b733ba3594e4cb621374905fe9b64a9c98d6503d",
- "sha256:0439b4850eb36157d85ee348649f074654ef919794653a143acb1cb02c977e3a",
- "sha256:052b89ce69fd69af0bb6759450ae769807710fc97e92b5dbafc55c12231b7374",
- "sha256:0cf6ff9006a9a68b6b7de0bedf815c6e7211b978d5f84b95e875c016e85e1645",
- "sha256:0d0089ba7d07bb96667a0925c341d47562236d07377ed80aa5748b5993397ebc",
- "sha256:131edc7701d3def8b72ef6cd61ab491d9efd7b92976dbababb424d74b9c7b180",
- "sha256:168a98d3f597b633cfeeae7fe1c78a8dfd81f018b866cf7ce9e4c56086af891a",
- "sha256:16bd1aaff4c738eff6ab953467c51a2bfd8679a0c033aa5d8a17da5822646037",
- "sha256:186755897c002a78cac75e7b943849fba52f18819c86c83d2bf8bf617a9fd04d",
- "sha256:19a96759c8fc695986b691357bb2624bbc5190a2eb1839fb923fbc5aeec46a8f",
- "sha256:1ddc688d2164f3b99f1c7caf1cf137d70af201fe97fd727c5dbe7ec92ac0f1e6",
- "sha256:217745ade5ecb95db9ff029de9f8574d9e9847943a8f4aec6c44ae21567b34d7",
- "sha256:2354d255f7bd0bb2fbca6422a866c3adf93ff739a67f5cee1111db1e1796fac3",
- "sha256:2ee33fb4115c03037297b852da05be4af5106aaa89cee48393edcab784a3e2b7",
- "sha256:2efb42b0fcfa77c361a13951393b33cca60c02144b857e919d26aa0778c44994",
- "sha256:317059e6432bf9f58b48c566891fe2c552048241365ec2b7259db9e7908dd605",
- "sha256:36017d60beaab06eee3f197f685f5bc44801e7847753848b66d2beef73ed4e62",
- "sha256:4e79fa210e839b783d9e3ae71e1d68fe9cd319a258a74545221b340bada186f8",
- "sha256:55534761c2cd67cb5daae0adf96cf70cfa927365ce1dd8754b7f2e5580207c22",
- "sha256:57ba1052560edf457e776ed31104e3aec6137e0946e2d3f9485c814546101ac1",
- "sha256:5972dea9ead8a6bd5fa26b839e811a19977d1de446d1086ad2de6b0f95cba0d4",
- "sha256:5abe02e95f8001b99ac7f283ef86162d9cf12bc2cb1dc8b6fb39195ac10393df",
- "sha256:5c06962e2675bcb8667c9c0325f9ca73b2bc2f1f49ff90b52fca2dbb996d1d65",
- "sha256:615098b7f77bfd8373cedbd51bbafa8f11524299a2c9daa85b377f6f276b4186",
- "sha256:65be516490973f57c16962cdd511db56813fa5bcb96c627726580aa5cb967507",
- "sha256:6a6b6311af0941151008f5c07f8ff906e27f3cbc2960c79f5c26c95ad21fa873",
- "sha256:6ba085292a44e3f485d2831cb782204a83c990acace602aaef1fc6c2c69a3a5d",
- "sha256:70bb314e9bbbe072e094d5efde42f1e8fac64f1711acc89356c5d9851fc360cf",
- "sha256:71472441b63c0d5afbb9b81a19c0dafbaf073d8ab1bbfc70450ec81d3c4b86dc",
- "sha256:738fb370d0dd6bca71b0d5402f001602e859ffd8c5dcd7bc8493ef49d0e44297",
- "sha256:7b785a5c3163c5b0d41dedb0795e973eea32ba3b336b412b3123ec45e8bbd73a",
- "sha256:7cf48fb7bb598906c595ae90b502a1801e85540d7b7bcbba7faeb46534beec5a",
- "sha256:7e6d700b1f9be03e53c35a15b5bb51c473327d321632b64ecdab71542078ac2d",
- "sha256:7ef899f2704784032a1f020e983b29ef5c519203ffd0d5a17ff5e1f751b8dba6",
- "sha256:80e361065b87b5b65413cf45f2c9040fb85ed5329c1bccfbceca18ebee02a927",
- "sha256:87f5994aef0a1f1fde01904421106c9006015d97e7b13aa72998f5a942093603",
- "sha256:8b392483ece13646514262900e65296c01d671453cc417e045e2970f4855decd",
- "sha256:8d29b136799a3e059d22e37a3c80046e39d50baf1b0e973feb9450fbd7772fce",
- "sha256:9022e25a33ecadad1e5ff0cd1c6c60832ba3b4633b1a4d30e107b2a1bfad502d",
- "sha256:90537df2ba4a6da52ca55ab888d4d2ee410a3c4365d70bb4a2530d77afa992d6",
- "sha256:90fea26840e6b163b735d32c3497aa82cfe7b590ec840f25bbaa091f9ba2324f",
- "sha256:95be50dd65e6a46570642645ea2bbcd62bce5a58e822319b15dbdedfcd9fa717",
- "sha256:a2fae9ac351106b27f9506f34a3e9970d596fb0906b752c6fdc2e46596070ffe",
- "sha256:a3414e310ebed867709cfd903dd70ca5359491b3513dee97c64dc7306e526d66",
- "sha256:a515d6ec630822d43e79c77abfd1a80b1b4b00ba7b366d4e1f100cd853508e0a",
- "sha256:a5bf7a01b82ea3bbc7b1bd1e5060c9602ea664c8ffc31140fc9becc2fa175c4c",
- "sha256:ae88dabe6ab63446f4a15b3ed206e0fc5c9de609d81d8d41b7dc083618ffd4fb",
- "sha256:af47c221ec28eb46d35b6875b1c9119b525c0fc2f36e0eb246af942c03d3066a",
- "sha256:b4f25808d7d98954e51a581970e9968e23d6019d6aca9d9abc809e8b0a25a3a7",
- "sha256:c00c6224881bed690e547fc9712831a955fc97064f211f888b00fe6df1501e98",
- "sha256:c238ab412aeacce3fc82a322a3a4f12b99bc9d456112e213a3e85bff594f64e6",
- "sha256:cdfb9378eb00aca697f4539f4e14f61f2ccf984ee8c5496bf474795df372fade",
- "sha256:d1ec57f0f91e76d9fa2bc3e2a0473c91d6feca6541ce41fac38d4581950ce31a",
- "sha256:d4505e1b555ba6d42f691d78575c1c9ecd99045929f0a601192258b699d4dee4",
- "sha256:d64c7eefdef56a70d47c2a73955d2116d4ee64865313b488d6a64cfeb5ba5600",
- "sha256:d65f6c3e5539d657b5914d28f2ea7f8f4aa86b333a4a7cdd9fdc34f7fe5be33a",
- "sha256:d731cee6fae8904626b1af1edf56bd16d4c8e709afddc96bee785dd3345d209b",
- "sha256:d8cacada9824eafc8306fdd5f73ff57348c56f113e5676b86c29f92a71b6595f",
- "sha256:d9684a7ffb6e65313b92c2ec1934033c91d9fe8265c6cdd87412cf057d0066a5",
- "sha256:d9e97db64de3b6e58e26720b313c17bec701d38f393cf1576a06105f9dcdc2e8",
- "sha256:dc09e1a07895e4b5ec77fd83445ce835d9f2f4446967acbc4de2dc72e5bbae4a",
- "sha256:e17b56d64d86f519f5da1e64d5149ac93a6f093dc9338cd3e4066f51937c4c5c",
- "sha256:e3b9d50f0e7fe89cc94a51f540d6085f483317556dfe8aa96a16d6f8f247f76a",
- "sha256:f1ef8efafef0cc0f81f36e709b30088004a163612a8c1bbddb7b007bfb1900ff",
- "sha256:f200c606e9302c992f831d504a83aa37f3f84b48215e454b44adff601a2705ea",
- "sha256:f34008c8ec08c193a44ad5aa12a59bee768e32f33117c806562feda22a3397bd",
- "sha256:f722f3fde021da707074f3b834fa2b89e3ebdc3766177c7f77e2055cc36ec2a3",
- "sha256:f90fa653530d7a989e79580b9f5e804e8d6733c6f0d76535facf56d1e30492b5",
- "sha256:fd88b7b254d67fde2d8f3539e9ff6d8170573e546a0d430c629b2eb64fd35f37",
- "sha256:fe85cbf99291c2eb66b7d7f4a15dd1e1203ffd96d4c87b32408e3a23b8c894e5"
+ "sha256:03bb511380fca2a13325b6f16fe8234c8e12da9660f0258cd45d9a02ffc916af",
+ "sha256:0bdb6942180660bda7f7d01f4c0def3cfc72b1c6d99aad964801775a3e379aba",
+ "sha256:0d489b50f49433494160c45048fe806de6b3aeab0586e497ebd22a0bab56e427",
+ "sha256:1a993dc8aadd901915a510fe6af5f20ae4256f527040066c22a154db8946751f",
+ "sha256:1d029f7051ec1bbeaa3e03030b6d8ed47ceb69cae9016f493c802a08af54e026",
+ "sha256:277757d5dad4e239dc1417438a0871b65b1b155beb108888e7438c27ffc6a8cc",
+ "sha256:2a7e528ab6e09c0d67dcb61a1730a292683e5ff9bb088950638d3170cf2a0a54",
+ "sha256:2aaaf618c68d8c9daebc23a20436bd01b09ee70d7fbf7072b7f38b06d2fab539",
+ "sha256:2be4f4550acd484912441f5f1209ba611ac399aac9355fee73611b9a0d4f949c",
+ "sha256:39692bedbe0b717001a99915ac0eb2d9d0bad546440d392a2042b96d813eede1",
+ "sha256:3fb9a88a4dd6336488f3de67ce75816d0d796dce53c2c6e4d70e0b565633c7fd",
+ "sha256:4038019b1bcaadde726a57430718394076c5a21545ebc5badad2c045a09546cf",
+ "sha256:463fd340a499d47b26ca42d2f36a639188738f6e2098c6dbf80aef0e60f461e1",
+ "sha256:4d3cafdf454354a621c8ab7408e45aa4e9d5c0b943b61ff4815f71ca6bdf0130",
+ "sha256:4ec533a8c1f8df797bded662ec3e494d225b37855bb63eb0d75464a07947477c",
+ "sha256:530bfb9efebcc1aab8bb4ebcbd92b54477eed11f6cf499355e882970a6d3aa7d",
+ "sha256:546c1a7470ecbf6239101e9aff0f709b68ca0f0268b34d9023019a55baa1f7c6",
+ "sha256:5843feb914796b1f0405ccf31ea0fb51034ceb65a7588edfd5a8250cb369e3b2",
+ "sha256:586724a0276d7a6083a17259d0b51622e492289a9998848a1b01b6441ca12b2f",
+ "sha256:59e975be4206cc54d0a112ef72fa3970a57c2b1bcc2c97ed41d6df0ebe518228",
+ "sha256:5a453c45178d7864c1bdd6bfe0ee3ed2883f63b9ba2c9bb967c6b586bf763f96",
+ "sha256:5bb05c28298803a74add08ba496879242ef159c75bc86a5406fac0ffc7dd021b",
+ "sha256:5e973e637112391f05581f427659c05b30b6843bc522a65be35ac7b18ce3dedd",
+ "sha256:66c80e9b366012dbee262bb1869e4fc5ba8786cda85928481528bc4a72ec2ee8",
+ "sha256:6a7620404da966f637b9ce8d4d3d543d363223f7a12452a575189c5355fc2d25",
+ "sha256:6f8bf4708a11b47517baf962f9a02196478bbb10fdb9582add4aa1459fa82380",
+ "sha256:735cd4528c55dbe4516d6d2b403331a99fc304f8feded8ae887cf97b67d589bb",
+ "sha256:7778c224efc38a40d274da4eb82a04cac27aae20012372a7db3c4bbd8926c4d4",
+ "sha256:8277d1f6282463c40761f802b742f833f9f2449fcdbb20a96579aa05c8feb614",
+ "sha256:88b6ea78b83d2796f330b0af1b70cdd3965dbdab02d8ac293260ec2c8fe340ee",
+ "sha256:8c07220408d3268e8268c9351c5c08041bc6f8c6172e59d398b71020df108541",
+ "sha256:8d0c019ee7dcf2c60e240877107cddbd95a5b1081787579bf179938392d66480",
+ "sha256:90b0186516b7a101c14764b0c25931b741fb0102f21253eff67847b4742dfc72",
+ "sha256:9837ac1650cc68d22a3cf5f15fb62c6964747d16cecc8b22431f113d6e39555d",
+ "sha256:9eac51307c6a1a38d5f86ebabc26a889fddf20cbba7a116ccb54ba1446601d5b",
+ "sha256:9f0c0d88b84259f93c3aa46398680646f2c23e43394779758d9f739c34e15295",
+ "sha256:a0ad38bc98d0b0497a0b0dbc29409bcabfcecff4511ed7063403c86de16927bc",
+ "sha256:b265cde49774752aec9ca7f5d272e3f98718164afc85521622a8a5394158a2b5",
+ "sha256:b6a107ab06206acc5359d4c5632bd9b22d448702a79b3169b0c62e0fb808bb2a",
+ "sha256:b7f920eaf46ebf41bd26f9df51c160d40f9e00b7b48471c3438cb8d027f7fb9b",
+ "sha256:c20498bd712b6e31a4402e1d027a1cd64f6a4a0066a3fe3c7344475886d07fdf",
+ "sha256:cb18d9cd7b3f35a2f5af47bb8ed6a5bdbf4f3ddee37f3daade4ab7864c292f5b",
+ "sha256:cf5bb9254e1c38aacf253d510d3d9be631bba21f3d068b17672b38b5cbf2fff5",
+ "sha256:d017775851a778ec9cc32651c4464079d06d927303c2dde9ae9830ccf6fe94e1",
+ "sha256:dc96668d9c7cc656609764275c5f8da58ef56d89bdd6810f6923d36497468ff7",
+ "sha256:e066a0586833d610c4bbddba0be5ba0e3e4f8e0bc5bb6d82103d8f8fc47bb59a",
+ "sha256:e3a013895c64352b49d0d8e107a84f99631b16dbab156ded33ebf0becf56c8b2",
+ "sha256:eaf905a580f2747c4a474040a5063cd5e0cc3d1d2d6edb65f28196186493ad4a"
],
"index": "ia",
- "version": "==0.6.0"
+ "version": "==0.6.1"
},
"pytz": {
"hashes": [
- "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
- "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
+ "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
+ "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
],
- "version": "==2021.3"
+ "version": "==2022.7"
},
"pytz-deprecation-shim": {
"hashes": [
"sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
"sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==0.1.0.post0"
},
"pyyaml": {
@@ -876,124 +892,210 @@
],
"version": "==5.3.1"
},
- "pyyaml-include": {
- "hashes": [
- "sha256:60047fc0a186debb9c80b3c3a23ef73b0096f6ee4ab2124b29d642c3fea0aebe",
- "sha256:d74c7209f5150d841a529acf47d7a023769c9160949f366fb4e17efa8999c4be"
- ],
- "version": "==1.2.post2"
- },
- "raven": {
- "extras": [
- "flask"
- ],
- "hashes": [
- "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
- "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
- ],
- "index": "ia",
- "version": "==6.10.0"
- },
- "readability-lxml": {
- "hashes": [
- "sha256:e0d366a21b1bd6cca17de71a4e6ea16fcfaa8b0a5b4004e39e2c7eff884e6305",
- "sha256:e51fea56b5909aaf886d307d48e79e096293255afa567b7d08bca94d25b1a4e1"
- ],
- "version": "==0.8.1"
+ "rapidfuzz": {
+ "hashes": [
+ "sha256:020858dd89b60ce38811cd6e37875c4c3c8d7fcd8bc20a0ad2ed1f464b34dc4e",
+ "sha256:042644133244bfa7b20de635d500eb9f46af7097f3d90b1724f94866f17cb55e",
+ "sha256:08590905a95ccfa43f4df353dcc5d28c15d70664299c64abcad8721d89adce4f",
+ "sha256:114810491efb25464016fd554fdf1e20d390309cecef62587494fc474d4b926f",
+ "sha256:1333fb3d603d6b1040e365dca4892ba72c7e896df77a54eae27dc07db90906e3",
+ "sha256:16080c05a63d6042643ae9b6cfec1aefd3e61cef53d0abe0df3069b9d4b72077",
+ "sha256:16ffad751f43ab61001187b3fb4a9447ec2d1aedeff7c5bac86d3b95f9980cc3",
+ "sha256:1f50d1227e6e2a0e3ae1fb1c9a2e1c59577d3051af72c7cab2bcc430cb5e18da",
+ "sha256:1fbad8fb28d98980f5bff33c7842efef0315d42f0cd59082108482a7e6b61410",
+ "sha256:23524635840500ce6f4d25005c9529a97621689c85d2f727c52eed1782839a6a",
+ "sha256:24d3fea10680d085fd0a4d76e581bfb2b1074e66e78fd5964d4559e1fcd2a2d4",
+ "sha256:24eb6b843492bdc63c79ee4b2f104059b7a2201fef17f25177f585d3be03405a",
+ "sha256:25b4cedf2aa19fb7212894ce5f5219010cce611b60350e9a0a4d492122e7b351",
+ "sha256:27be9c63215d302ede7d654142a2e21f0d34ea6acba512a4ae4cfd52bbaa5b59",
+ "sha256:2c836f0f2d33d4614c3fbaf9a1eb5407c0fe23f8876f47fd15b90f78daa64c34",
+ "sha256:3a9bd02e1679c0fd2ecf69b72d0652dbe2a9844eaf04a36ddf4adfbd70010e95",
+ "sha256:3d8b081988d0a49c486e4e845a547565fee7c6e7ad8be57ff29c3d7c14c6894c",
+ "sha256:3dcffe1f3cbda0dc32133a2ae2255526561ca594f15f9644384549037b355245",
+ "sha256:3f11a7eff7bc6301cd6a5d43f309e22a815af07e1f08eeb2182892fca04c86cb",
+ "sha256:42085d4b154a8232767de8296ac39c8af5bccee6b823b0507de35f51c9cbc2d7",
+ "sha256:424f82c35dbe4f83bdc3b490d7d696a1dc6423b3d911460f5493b7ffae999fd2",
+ "sha256:43fb8cb030f888c3f076d40d428ed5eb4331f5dd6cf1796cfa39c67bf0f0fc1e",
+ "sha256:460853983ab88f873173e27cc601c5276d469388e6ad6e08c4fd57b2a86f1064",
+ "sha256:467c1505362823a5af12b10234cb1c4771ccf124c00e3fc9a43696512bd52293",
+ "sha256:46b9b8aa09998bc48dd800854e8d9b74bc534d7922c1d6e1bbf783e7fa6ac29c",
+ "sha256:53dcae85956853b787c27c1cb06f18bb450e22cf57a4ad3444cf03b8ff31724a",
+ "sha256:585206112c294e335d84de5d5f179c0f932837752d7420e3de21db7fdc476278",
+ "sha256:5ada0a14c67452358c1ee52ad14b80517a87b944897aaec3e875279371a9cb96",
+ "sha256:5e2b3d020219baa75f82a4e24b7c8adcb598c62f0e54e763c39361a9e5bad510",
+ "sha256:6120f2995f5154057454c5de99d86b4ef3b38397899b5da1265467e8980b2f60",
+ "sha256:68a89bb06d5a331511961f4d3fa7606f8e21237467ba9997cae6f67a1c2c2b9e",
+ "sha256:7496e8779905b02abc0ab4ba2a848e802ab99a6e20756ffc967a0de4900bd3da",
+ "sha256:759a3361711586a29bc753d3d1bdb862983bd9b9f37fbd7f6216c24f7c972554",
+ "sha256:75c45dcd595f8178412367e302fd022860ea025dc4a78b197b35428081ed33d5",
+ "sha256:7d005e058d86f2a968a8d28ca6f2052fab1f124a39035aa0523261d6baf21e1f",
+ "sha256:7f7930adf84301797c3f09c94b9c5a9ed90a9e8b8ed19b41d2384937e0f9f5bd",
+ "sha256:8109e0324d21993d5b2d111742bf5958f3516bf8c59f297c5d1cc25a2342eb66",
+ "sha256:81642a24798851b118f82884205fc1bd9ff70b655c04018c467824b6ecc1fabc",
+ "sha256:8450d15f7765482e86ef9be2ad1a05683cd826f59ad236ef7b9fb606464a56aa",
+ "sha256:875d51b3497439a72e2d76183e1cb5468f3f979ab2ddfc1d1f7dde3b1ecfb42f",
+ "sha256:8b477b43ced896301665183a5e0faec0f5aea2373005648da8bdcb3c4b73f280",
+ "sha256:8d3e252d4127c79b4d7c2ae47271636cbaca905c8bb46d80c7930ab906cf4b5c",
+ "sha256:916bc2e6cf492c77ad6deb7bcd088f0ce9c607aaeabc543edeb703e1fbc43e31",
+ "sha256:988f8f6abfba7ee79449f8b50687c174733b079521c3cc121d65ad2d38831846",
+ "sha256:99a84ab9ac9a823e7e93b4414f86344052a5f3e23b23aa365cda01393ad895bd",
+ "sha256:9be02162af0376d64b840f2fc8ee3366794fc149f1e06d095a6a1d42447d97c5",
+ "sha256:a5585189b3d90d81ccd62d4f18530d5ac8972021f0aaaa1ffc6af387ff1dce75",
+ "sha256:ae33a72336059213996fe4baca4e0e4860913905c2efb7c991eab33b95a98a0a",
+ "sha256:af4f7c3c904ca709493eb66ca9080b44190c38e9ecb3b48b96d38825d5672559",
+ "sha256:b20141fa6cee041917801de0bab503447196d372d4c7ee9a03721b0a8edf5337",
+ "sha256:b3210869161a864f3831635bb13d24f4708c0aa7208ef5baac1ac4d46e9b4208",
+ "sha256:b34e8c0e492949ecdd5da46a1cfc856a342e2f0389b379b1a45a3cdcd3176a6e",
+ "sha256:b52ac2626945cd21a2487aeefed794c14ee31514c8ae69b7599170418211e6f6",
+ "sha256:b5dd713a1734574c2850c566ac4286594bacbc2d60b9170b795bee4b68656625",
+ "sha256:b5f705652360d520c2de52bee11100c92f59b3e3daca308ebb150cbc58aecdad",
+ "sha256:b6389c50d8d214c9cd11a77f6d501529cb23279a9c9cafe519a3a4b503b5f72a",
+ "sha256:b6bad92de071cbffa2acd4239c1779f66851b60ffbbda0e4f4e8a2e9b17e7eef",
+ "sha256:b75dd0928ce8e216f88660ab3d5c5ffe990f4dd682fd1709dba29d5dafdde6de",
+ "sha256:c2523f8180ebd9796c18d809e9a19075a1060b1a170fde3799e83db940c1b6d5",
+ "sha256:c31022d9970177f6affc6d5dd757ed22e44a10890212032fabab903fdee3bfe7",
+ "sha256:c36fd260084bb636b9400bb92016c6bd81fd80e59ed47f2466f85eda1fc9f782",
+ "sha256:c3741cb0bf9794783028e8b0cf23dab917fa5e37a6093b94c4c2f805f8e36b9f",
+ "sha256:c3fbe449d869ea4d0909fc9d862007fb39a584fb0b73349a6aab336f0d90eaed",
+ "sha256:c66546e30addb04a16cd864f10f5821272a1bfe6462ee5605613b4f1cb6f7b48",
+ "sha256:c71d9d512b76f05fa00282227c2ae884abb60e09f08b5ca3132b7e7431ac7f0d",
+ "sha256:c8601a66fbfc0052bb7860d2eacd303fcde3c14e87fdde409eceff516d659e77",
+ "sha256:c88adbcb933f6b8612f6c593384bf824e562bb35fc8a0f55fac690ab5b3486e5",
+ "sha256:ca00fafd2756bc9649bf80f1cf72c647dce38635f0695d7ce804bc0f759aa756",
+ "sha256:ca8a23097c1f50e0fdb4de9e427537ca122a18df2eead06ed39c3a0bef6d9d3a",
+ "sha256:cda1e2f66bb4ba7261a0f4c2d052d5d909798fca557cbff68f8a79a87d66a18f",
+ "sha256:cdfc04f7647c29fb48da7a04082c34cdb16f878d3c6d098d62d5715c0ad3000c",
+ "sha256:cf62dacb3f9234f3fddd74e178e6d25c68f2067fde765f1d95f87b1381248f58",
+ "sha256:d00df2e4a81ffa56a6b1ec4d2bc29afdcb7f565e0b8cd3092fece2290c4c7a79",
+ "sha256:d248a109699ce9992304e79c1f8735c82cc4c1386cd8e27027329c0549f248a2",
+ "sha256:d63def9bbc6b35aef4d76dc740301a4185867e8870cbb8719ec9de672212fca8",
+ "sha256:d82f20c0060ffdaadaf642b88ab0aa52365b56dffae812e188e5bdb998043588",
+ "sha256:dbcf5371ea704759fcce772c66a07647751d1f5dbdec7818331c9b31ae996c77",
+ "sha256:e8914dad106dacb0775718e54bf15e528055c4e92fb2677842996f2d52da5069",
+ "sha256:ebe303cd9839af69dd1f7942acaa80b1ba90bacef2e7ded9347fbed4f1654672",
+ "sha256:ec55a81ac2b0f41b8d6fb29aad16e55417036c7563bad5568686931aa4ff08f7",
+ "sha256:effe182767d102cb65dfbbf74192237dbd22d4191928d59415aa7d7c861d8c88",
+ "sha256:f42b82f268689f429def9ecfb86fa65ceea0eaf3fed408b570fe113311bf5ce7",
+ "sha256:f6fe570e20e293eb50491ae14ddeef71a6a7e5f59d7e791393ffa99b13f1f8c2",
+ "sha256:f799d1d6c33d81e983d3682571cc7d993ae7ff772c19b3aabb767039c33f6d1e",
+ "sha256:f891b98f8bc6c9d521785816085e9657212621e93f223917fb8e32f318b2957e",
+ "sha256:fa263135b892686e11d5b84f6a1892523123a00b7e5882eff4fbdabb38667347",
+ "sha256:fa4c598ed77f74ec973247ca776341200b0f93ec3883e34c222907ce72cb92a4",
+ "sha256:fe56659ccadbee97908132135de4b875543353351e0c92e736b7c57aee298b5a",
+ "sha256:fe59a0c21a032024edb0c8e43f5dee5623fef0b65a1e3c1281836d9ce199af3b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.13.7"
},
"redis": {
"hashes": [
- "sha256:c8481cf414474e3497ec7971a1ba9b998c8efad0f0d289a009a5bbef040894f9",
- "sha256:ccf692811f2c1fc7a92b466aa2599e4a6d2d73d5f736a2c70be600657c0da34a"
+ "sha256:7b8c87d19c45d3f1271b124858d2a5c13160c4e74d4835e28273400fa34d5228",
+ "sha256:cae3ee5d1f57d8caf534cd8764edf3163c77e073bdd74b6f54a87ffafdc5e7d9"
],
- "version": "==4.0.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"regex": {
"hashes": [
- "sha256:0416f7399e918c4b0e074a0f66e5191077ee2ca32a0f99d4c187a62beb47aa05",
- "sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f",
- "sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc",
- "sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4",
- "sha256:0f594b96fe2e0821d026365f72ac7b4f0b487487fb3d4aaf10dd9d97d88a9737",
- "sha256:139a23d1f5d30db2cc6c7fd9c6d6497872a672db22c4ae1910be22d4f4b2068a",
- "sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4",
- "sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8",
- "sha256:2409b5c9cef7054dde93a9803156b411b677affc84fca69e908b1cb2c540025d",
- "sha256:2fee3ed82a011184807d2127f1733b4f6b2ff6ec7151d83ef3477f3b96a13d03",
- "sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f",
- "sha256:3598893bde43091ee5ca0a6ad20f08a0435e93a69255eeb5f81b85e81e329264",
- "sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a",
- "sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef",
- "sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f",
- "sha256:42b50fa6666b0d50c30a990527127334d6b96dd969011e843e726a64011485da",
- "sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc",
- "sha256:473e67837f786404570eae33c3b64a4b9635ae9f00145250851a1292f484c063",
- "sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50",
- "sha256:50a7ddf3d131dc5633dccdb51417e2d1910d25cbcf842115a3a5893509140a3a",
- "sha256:529801a0d58809b60b3531ee804d3e3be4b412c94b5d267daa3de7fadef00f49",
- "sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d",
- "sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d",
- "sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733",
- "sha256:563d5f9354e15e048465061509403f68424fef37d5add3064038c2511c8f5e00",
- "sha256:5d408a642a5484b9b4d11dea15a489ea0928c7e410c7525cd892f4d04f2f617b",
- "sha256:61600a7ca4bcf78a96a68a27c2ae9389763b5b94b63943d5158f2a377e09d29a",
- "sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36",
- "sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345",
- "sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0",
- "sha256:6e1d2cc79e8dae442b3fa4a26c5794428b98f81389af90623ffcc650ce9f6732",
- "sha256:74cbeac0451f27d4f50e6e8a8f3a52ca074b5e2da9f7b505c4201a57a8ed6286",
- "sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12",
- "sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646",
- "sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667",
- "sha256:7f301b11b9d214f83ddaf689181051e7f48905568b0c7017c04c06dfd065e244",
- "sha256:83ee89483672b11f8952b158640d0c0ff02dc43d9cb1b70c1564b49abe92ce29",
- "sha256:85bfa6a5413be0ee6c5c4a663668a2cad2cbecdee367630d097d7823041bdeec",
- "sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf",
- "sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4",
- "sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449",
- "sha256:96fc32c16ea6d60d3ca7f63397bff5c75c5a562f7db6dec7d412f7c4d2e78ec0",
- "sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a",
- "sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d",
- "sha256:a955b747d620a50408b7fdf948e04359d6e762ff8a85f5775d907ceced715129",
- "sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb",
- "sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e",
- "sha256:b9ed0b1e5e0759d6b7f8e2f143894b2a7f3edd313f38cf44e1e15d360e11749b",
- "sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83",
- "sha256:ca49e1ab99593438b204e00f3970e7a5f70d045267051dfa6b5f4304fcfa1dbf",
- "sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e",
- "sha256:cd410a1cbb2d297c67d8521759ab2ee3f1d66206d2e4328502a487589a2cb21b",
- "sha256:ce298e3d0c65bd03fa65ffcc6db0e2b578e8f626d468db64fdf8457731052942",
- "sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a",
- "sha256:d5fd67df77bab0d3f4ea1d7afca9ef15c2ee35dfb348c7b57ffb9782a6e4db6e",
- "sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94",
- "sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc",
- "sha256:dc07f021ee80510f3cd3af2cad5b6a3b3a10b057521d9e6aaeb621730d320c5a",
- "sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e",
- "sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965",
- "sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0",
- "sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36",
- "sha256:e6096b0688e6e14af6a1b10eaad86b4ff17935c49aa774eac7c95a57a4e8c296",
- "sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec",
- "sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23",
- "sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7",
- "sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe",
- "sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6",
- "sha256:f5be7805e53dafe94d295399cfbe5227f39995a997f4fd8539bf3cbdc8f47ca8",
- "sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b",
- "sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb",
- "sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b",
- "sha256:fbb9dc00e39f3e6c0ef48edee202f9520dafb233e8b51b06b8428cfcb92abd30",
- "sha256:fff55f3ce50a3ff63ec8e2a8d3dd924f1941b250b0aac3d3d42b687eeff07a8e"
- ],
- "version": "==2021.11.10"
+ "sha256:052b670fafbe30966bbe5d025e90b2a491f85dfe5b2583a163b5e60a85a321ad",
+ "sha256:0653d012b3bf45f194e5e6a41df9258811ac8fc395579fa82958a8b76286bea4",
+ "sha256:0a069c8483466806ab94ea9068c34b200b8bfc66b6762f45a831c4baaa9e8cdd",
+ "sha256:0cf0da36a212978be2c2e2e2d04bdff46f850108fccc1851332bcae51c8907cc",
+ "sha256:131d4be09bea7ce2577f9623e415cab287a3c8e0624f778c1d955ec7c281bd4d",
+ "sha256:144486e029793a733e43b2e37df16a16df4ceb62102636ff3db6033994711066",
+ "sha256:1ddf14031a3882f684b8642cb74eea3af93a2be68893901b2b387c5fd92a03ec",
+ "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9",
+ "sha256:20f61c9944f0be2dc2b75689ba409938c14876c19d02f7585af4460b6a21403e",
+ "sha256:22960019a842777a9fa5134c2364efaed5fbf9610ddc5c904bd3a400973b0eb8",
+ "sha256:22e7ebc231d28393dfdc19b185d97e14a0f178bedd78e85aad660e93b646604e",
+ "sha256:23cbb932cc53a86ebde0fb72e7e645f9a5eec1a5af7aa9ce333e46286caef783",
+ "sha256:29c04741b9ae13d1e94cf93fca257730b97ce6ea64cfe1eba11cf9ac4e85afb6",
+ "sha256:2bde29cc44fa81c0a0c8686992c3080b37c488df167a371500b2a43ce9f026d1",
+ "sha256:2cdc55ca07b4e70dda898d2ab7150ecf17c990076d3acd7a5f3b25cb23a69f1c",
+ "sha256:370f6e97d02bf2dd20d7468ce4f38e173a124e769762d00beadec3bc2f4b3bc4",
+ "sha256:395161bbdbd04a8333b9ff9763a05e9ceb4fe210e3c7690f5e68cedd3d65d8e1",
+ "sha256:44136355e2f5e06bf6b23d337a75386371ba742ffa771440b85bed367c1318d1",
+ "sha256:44a6c2f6374e0033873e9ed577a54a3602b4f609867794c1a3ebba65e4c93ee7",
+ "sha256:4919899577ba37f505aaebdf6e7dc812d55e8f097331312db7f1aab18767cce8",
+ "sha256:4b4b1fe58cd102d75ef0552cf17242705ce0759f9695334a56644ad2d83903fe",
+ "sha256:4bdd56ee719a8f751cf5a593476a441c4e56c9b64dc1f0f30902858c4ef8771d",
+ "sha256:4bf41b8b0a80708f7e0384519795e80dcb44d7199a35d52c15cc674d10b3081b",
+ "sha256:4cac3405d8dda8bc6ed499557625585544dd5cbf32072dcc72b5a176cb1271c8",
+ "sha256:4fe7fda2fe7c8890d454f2cbc91d6c01baf206fbc96d89a80241a02985118c0c",
+ "sha256:50921c140561d3db2ab9f5b11c5184846cde686bb5a9dc64cae442926e86f3af",
+ "sha256:5217c25229b6a85049416a5c1e6451e9060a1edcf988641e309dbe3ab26d3e49",
+ "sha256:5352bea8a8f84b89d45ccc503f390a6be77917932b1c98c4cdc3565137acc714",
+ "sha256:542e3e306d1669b25936b64917285cdffcd4f5c6f0247636fec037187bd93542",
+ "sha256:543883e3496c8b6d58bd036c99486c3c8387c2fc01f7a342b760c1ea3158a318",
+ "sha256:586b36ebda81e6c1a9c5a5d0bfdc236399ba6595e1397842fd4a45648c30f35e",
+ "sha256:597f899f4ed42a38df7b0e46714880fb4e19a25c2f66e5c908805466721760f5",
+ "sha256:5a260758454580f11dd8743fa98319bb046037dfab4f7828008909d0aa5292bc",
+ "sha256:5aefb84a301327ad115e9d346c8e2760009131d9d4b4c6b213648d02e2abe144",
+ "sha256:5e6a5567078b3eaed93558842346c9d678e116ab0135e22eb72db8325e90b453",
+ "sha256:5ff525698de226c0ca743bfa71fc6b378cda2ddcf0d22d7c37b1cc925c9650a5",
+ "sha256:61edbca89aa3f5ef7ecac8c23d975fe7261c12665f1d90a6b1af527bba86ce61",
+ "sha256:659175b2144d199560d99a8d13b2228b85e6019b6e09e556209dfb8c37b78a11",
+ "sha256:6a9a19bea8495bb419dc5d38c4519567781cd8d571c72efc6aa959473d10221a",
+ "sha256:6b30bddd61d2a3261f025ad0f9ee2586988c6a00c780a2fb0a92cea2aa702c54",
+ "sha256:6ffd55b5aedc6f25fd8d9f905c9376ca44fcf768673ffb9d160dd6f409bfda73",
+ "sha256:702d8fc6f25bbf412ee706bd73019da5e44a8400861dfff7ff31eb5b4a1276dc",
+ "sha256:74bcab50a13960f2a610cdcd066e25f1fd59e23b69637c92ad470784a51b1347",
+ "sha256:75f591b2055523fc02a4bbe598aa867df9e953255f0b7f7715d2a36a9c30065c",
+ "sha256:763b64853b0a8f4f9cfb41a76a4a85a9bcda7fdda5cb057016e7706fde928e66",
+ "sha256:76c598ca73ec73a2f568e2a72ba46c3b6c8690ad9a07092b18e48ceb936e9f0c",
+ "sha256:78d680ef3e4d405f36f0d6d1ea54e740366f061645930072d39bca16a10d8c93",
+ "sha256:7b280948d00bd3973c1998f92e22aa3ecb76682e3a4255f33e1020bd32adf443",
+ "sha256:7db345956ecce0c99b97b042b4ca7326feeec6b75facd8390af73b18e2650ffc",
+ "sha256:7dbdce0c534bbf52274b94768b3498abdf675a691fec5f751b6057b3030f34c1",
+ "sha256:7ef6b5942e6bfc5706301a18a62300c60db9af7f6368042227ccb7eeb22d0892",
+ "sha256:7f5a3ffc731494f1a57bd91c47dc483a1e10048131ffb52d901bfe2beb6102e8",
+ "sha256:8a45b6514861916c429e6059a55cf7db74670eaed2052a648e3e4d04f070e001",
+ "sha256:8ad241da7fac963d7573cc67a064c57c58766b62a9a20c452ca1f21050868dfa",
+ "sha256:8b0886885f7323beea6f552c28bff62cbe0983b9fbb94126531693ea6c5ebb90",
+ "sha256:8ca88da1bd78990b536c4a7765f719803eb4f8f9971cc22d6ca965c10a7f2c4c",
+ "sha256:8e0caeff18b96ea90fc0eb6e3bdb2b10ab5b01a95128dfeccb64a7238decf5f0",
+ "sha256:957403a978e10fb3ca42572a23e6f7badff39aa1ce2f4ade68ee452dc6807692",
+ "sha256:9af69f6746120998cd9c355e9c3c6aec7dff70d47247188feb4f829502be8ab4",
+ "sha256:9c94f7cc91ab16b36ba5ce476f1904c91d6c92441f01cd61a8e2729442d6fcf5",
+ "sha256:a37d51fa9a00d265cf73f3de3930fa9c41548177ba4f0faf76e61d512c774690",
+ "sha256:a3a98921da9a1bf8457aeee6a551948a83601689e5ecdd736894ea9bbec77e83",
+ "sha256:a3c1ebd4ed8e76e886507c9eddb1a891673686c813adf889b864a17fafcf6d66",
+ "sha256:a5f9505efd574d1e5b4a76ac9dd92a12acb2b309551e9aa874c13c11caefbe4f",
+ "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f",
+ "sha256:a9d0b68ac1743964755ae2d89772c7e6fb0118acd4d0b7464eaf3921c6b49dd4",
+ "sha256:aa62a07ac93b7cb6b7d0389d8ef57ffc321d78f60c037b19dfa78d6b17c928ee",
+ "sha256:ac741bf78b9bb432e2d314439275235f41656e189856b11fb4e774d9f7246d81",
+ "sha256:ae1e96785696b543394a4e3f15f3f225d44f3c55dafe3f206493031419fedf95",
+ "sha256:b683e5fd7f74fb66e89a1ed16076dbab3f8e9f34c18b1979ded614fe10cdc4d9",
+ "sha256:b7a8b43ee64ca8f4befa2bea4083f7c52c92864d8518244bfa6e88c751fa8fff",
+ "sha256:b8e38472739028e5f2c3a4aded0ab7eadc447f0d84f310c7a8bb697ec417229e",
+ "sha256:bfff48c7bd23c6e2aec6454aaf6edc44444b229e94743b34bdcdda2e35126cf5",
+ "sha256:c14b63c9d7bab795d17392c7c1f9aaabbffd4cf4387725a0ac69109fb3b550c6",
+ "sha256:c27cc1e4b197092e50ddbf0118c788d9977f3f8f35bfbbd3e76c1846a3443df7",
+ "sha256:c28d3309ebd6d6b2cf82969b5179bed5fefe6142c70f354ece94324fa11bf6a1",
+ "sha256:c670f4773f2f6f1957ff8a3962c7dd12e4be54d05839b216cb7fd70b5a1df394",
+ "sha256:ce6910b56b700bea7be82c54ddf2e0ed792a577dfaa4a76b9af07d550af435c6",
+ "sha256:d0213671691e341f6849bf33cd9fad21f7b1cb88b89e024f33370733fec58742",
+ "sha256:d03fe67b2325cb3f09be029fd5da8df9e6974f0cde2c2ac6a79d2634e791dd57",
+ "sha256:d0e5af9a9effb88535a472e19169e09ce750c3d442fb222254a276d77808620b",
+ "sha256:d243b36fbf3d73c25e48014961e83c19c9cc92530516ce3c43050ea6276a2ab7",
+ "sha256:d26166acf62f731f50bdd885b04b38828436d74e8e362bfcb8df221d868b5d9b",
+ "sha256:d403d781b0e06d2922435ce3b8d2376579f0c217ae491e273bab8d092727d244",
+ "sha256:d8716f82502997b3d0895d1c64c3b834181b1eaca28f3f6336a71777e437c2af",
+ "sha256:e4f781ffedd17b0b834c8731b75cce2639d5a8afe961c1e58ee7f1f20b3af185",
+ "sha256:e613a98ead2005c4ce037c7b061f2409a1a4e45099edb0ef3200ee26ed2a69a8",
+ "sha256:ef4163770525257876f10e8ece1cf25b71468316f61451ded1a6f44273eedeb5"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.10.31"
},
"requests": {
"hashes": [
- "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
- "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.26.0"
+ "version": "==2.28.1"
},
"requests-file": {
"hashes": [
@@ -1010,16 +1112,18 @@
},
"s3transfer": {
"hashes": [
- "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c",
- "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"
+ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
+ "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
- "version": "==0.5.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.6.0"
},
"schedule": {
"hashes": [
"sha256:617adce8b4bf38c360b781297d59918fbebfb2878f1671d189f4f4af5d0567a4",
"sha256:e6ca13585e62c810e13a08682e0a6a8ad245372e376ba2b8679294f377dfc8e4"
],
+ "markers": "python_version >= '3.6'",
"version": "==1.1.0"
},
"schema": {
@@ -1031,89 +1135,95 @@
},
"selectolax": {
"hashes": [
- "sha256:0163857504c02060ab7b501e8a5789b67e9c9d01202070908bf7f96dac652026",
- "sha256:01e08b75870b92388f933219c835a105d4995f4070c9b7a3667bcdecf9ea1ccb",
- "sha256:024030a289155a95ef38e90d56ef123ea78a906d8b84aefd14250af9a0164fce",
- "sha256:0d6d4c7e999770def0c5d7af79bf1c8225f2336f77e77729ab01d850aca2152e",
- "sha256:1aaccbd22abb0eddfd0da96294e47900992af8c6eb77cac14ae2a699fee75b07",
- "sha256:1e2b5f0509a8e444a35376adc0c1e52fb9e3db81b468049bd670c218658f42ed",
- "sha256:36d9df68c707bdde91bbdb2d12a828bc365ce27016ffdfb7c2340141732938ae",
- "sha256:39bcf60adf2c7556cc1a0d788a4944b335c218dda52ece1f2410fcce9cfb92ef",
- "sha256:3b2830fa10fd217cb1ce9793f2724723c50bbe8c60e2993b64fef21b4c70813f",
- "sha256:44598f611945880f230f4117c6836f0832e420362ce57585cffc62b15e10a544",
- "sha256:4491beace0544654b4ba84e955beea6b40388ddf9cc7487a68c302675ba7b69f",
- "sha256:4a7f48902d12b56c856fbbc1ebe6494ac48cbca4d9a3a5400744a8066e4e5856",
- "sha256:4eb4f2b0fe32985771907d3806a0670262bcd1866186e55fc6c553e22e38edaa",
- "sha256:50c2c1e1cbca3abd9788af35aad5f14906331500e089a06e6d3c7776bd972e4f",
- "sha256:542623f4d3a33735f1690a551cad0f9f56a62ced994abfbcfcfaa8f46741f6ef",
- "sha256:54fbc6858ad8e0b6fcc2d2b9e63836a3afb9631e620c868fd4b880adf877771c",
- "sha256:55a5a231c64c0f67d4e8535ebf7c449ad41425f76048e5f0b4a51546c6c32df8",
- "sha256:5af460e9074c2a3496f65e779e9b53c8040a518178c98ccbe982bc643ff1e4cb",
- "sha256:5b83c5ad549df24559f295e7420914a5c0f9c7f732eff0cb026e8cd419415acb",
- "sha256:5bea95084827b7d805069476ce3aea2d47486dc3615f31dec7108813617cc629",
- "sha256:5f624e1d0dc94dd004c18cf65b0b7d3c692f2980a681c498076ecd80f2ac7e23",
- "sha256:70c38a68825038cc923ab94cb1751796e419d2252077118b9b36bbeb5a918900",
- "sha256:73d50756df7fae309ff07caa56954d0eb62a4eaf7e5a88eb85105d82010a0346",
- "sha256:808539fc6276e86f37e044dacc5bd3b03292b9948210d9d482c1be0782c43432",
- "sha256:80f62f01ae0ac56e1d1affe5e1e893bdcc672987ba1341e0a25e2d3ce206975c",
- "sha256:85b3dac6725cc814f0b972f447cb65df7adf42f77ed1763f9b812e37acf3fa4b",
- "sha256:8a19a0b66fd610eaa5e57485e03afd196dd59c677f6ebdcc9b858060bd717814",
- "sha256:9206cffbcb87df33a74e1c7b252cf5f60953c00f6edeb640980612d91b83e7ab",
- "sha256:99701c799bbbcafab0853d2989520207f3f0f919219637b08e0267c967d58863",
- "sha256:9f35922c5bc185fc8b718beb22478b2e5b7ee2ea4242c7fe3c72b1b803610e83",
- "sha256:aa76516a76454a10011090f6106015b22ac127d5e736b095094eefa6afe1f223",
- "sha256:ac48ef1fdee1f587fc86dbc626064ec0bfcc546356d5b04685d5cf4f74129470",
- "sha256:ac4c1920e80fe7721245d0892f4127911cb02eebba31d304044a95acd8e4546e",
- "sha256:b15c52be6ed79ef60668d56688cddc08c1f2fb913e372068139731651d9b39f4",
- "sha256:bc0065e496a04050f23e3261ec223c67144ef195234962ea6d83b13019049d4a",
- "sha256:c2d8d4dabc8be649ec9e02ca5b6d208c7c8f515281128a7053aceff8615013a4",
- "sha256:c39bff0661103ab578013c2d1a31dd49bae15a0170d46140898ad0eae651ffc9",
- "sha256:c57ed42b46f7fb322e16f4a46ea6198fa669fc9e421c0fd17d5948f2db3774e3",
- "sha256:c62f4f3805e4e8f89fafefd0852a885c69781dc5431e86510b13ddd4c6dabe5d",
- "sha256:d2ef7d2a71a7e888a8df390594f0f6b9e9053d8e769f318ad84d1e5e82041479",
- "sha256:d71d7ae7bec6e856b83111f270bc783e531d86e02ad3a78a37d8591ff8f3620f",
- "sha256:db056bd2dd821def9ae962952e219f755251967d2032828371352cc6f63f3339",
- "sha256:dff782aafad28418e05a16f9db82e58fe5b628edb08f021003a1cdb06572c08d",
- "sha256:e18a6ddb2486b435189209ee99849c35095f03aeb3be05c64602cde5112bd2a2",
- "sha256:e46fbd3356469612ce0693f78ea8aa8c6611fa8f86d83c7397e1520affdcbfd0",
- "sha256:e63e5470db00200d252726f998a8ee8eb3edd40a9f8659556e7500191b9d5dac",
- "sha256:e82bc003574d3374ef13b5597c4fde94fbd767bf37ccc99066c565f25634e2fd",
- "sha256:e85cd43ad146e43a891167305553f1c8aa68e60fed8152e67b9b081fe4c865eb",
- "sha256:ee17acc5c188a8e511edbc0cfbf6d1583db750eeac42e2703458e39b90eab4a3",
- "sha256:f176b0dbeea3f8643312595634503c5b3686002287d744528318a9c18e85ec98",
- "sha256:f951d736761f67dac3fea717e0475f9c945571967d121721d64add950ab7ac92"
+ "sha256:010b008aca04be6cf9727d6f206a583d79a82d397126a101f57f117113a082bb",
+ "sha256:0878aa1ab3906831b20ad9e316a77c8401030dd388f3c1c72ba51bc08d497584",
+ "sha256:087e663c0ba6d9d79294508b0a3145079e838950a0e2fc7b8b1485da3fe24254",
+ "sha256:0a8dddd34dea642429629aae21cf940668eaa1c66ab0bcf9970d72f38676697d",
+ "sha256:14c9368f9dd224f895ef1431b1961d6e9a56fb26a95b5c04900def7b8961744c",
+ "sha256:17ac0b2b4222ba2c16852c0035dcd31d9e100544e6a5138f6e01f6b1648691b5",
+ "sha256:1ba1cd707a0d0090cffb2851ec6ccfdc334ed0c2ea08ae8705a9f6c97a997f77",
+ "sha256:1d38157e2358dacf55e782d332b41391821b2ef237e34e47ff276b2184c96542",
+ "sha256:1f1ec20cc75e1866f7758e543907da222c5d8072e580cf6814f2f142036c695f",
+ "sha256:1fa1737b7031b467d8613919503c85482a59c65ac91fe60074180e625e2533c6",
+ "sha256:221051ffe8c2950e9ebe41e08103397a7b287dca05a9e8084bb9e925f2d9c556",
+ "sha256:264918c1e9e6f6657f47116e4dbd74b57c660d3e86f9cc78209f132c56c8e9e5",
+ "sha256:2d8c7ce06bdf83d3cd2a617211eec48c875826bae54c74e56aec2635daac2f31",
+ "sha256:31fb0fbc88674b3346e379664c5837070e79b2f65eab3e29b7c43e1b4fc1137c",
+ "sha256:3600747c5072725580f8dc249a40ae123840f22edab950f43b349d356f44268b",
+ "sha256:3d65d0c57cfa1b05beb5c72d3cb566f4fdaf16e5112082f300cfa6bd94836aff",
+ "sha256:3daaf7ec54565d3f15f9ce046f6a8e469d966dc4fc879af8c7f753d37994f70e",
+ "sha256:418738a2f46beea2444a1587adb4f509bdd8e7ddffac071dba097c1a3ddb8cfc",
+ "sha256:46776ca482a76b3f522e4d8f90474716e4da51dc2823f3ecc6a2ff38ef0663b7",
+ "sha256:46bacca9e9f077ff2c5a973c05b8862425f077c58f2dca8059b992ceaca6b6de",
+ "sha256:4c5c68f0139d0928298ef5e95137996e0efb6f8db364b1470221e8710834a0ab",
+ "sha256:51c33d33e4e4eec0d9c1b6accdda5c93f4e3a00b28e99fc4ebb2b95d1d4ef885",
+ "sha256:585a75f4aff85b48d0fc8f3e9afbd1e2c05902a332982d04bab93e8e1db2e4a4",
+ "sha256:5acbe02c26b43428c2f49e8f09a81bd47be7ea969c6798cde1a23c2b33d25c79",
+ "sha256:6111ac9e5ca02b13d8e3057c1e20d6608435c64a11f92460a59951a7209c2cf3",
+ "sha256:67c32c29bc9011ed1b6fd67a961073e69d67bf60bf09f3db54d6240c034719f4",
+ "sha256:68c42af2cabecf04528dff2d0bbebbecfbafc394a5192b6a5b3e1dcd19eeb766",
+ "sha256:709b1680a16f210c43e4f3240dfc15e3312ccd43c9ea20c8e20c81470214cfc6",
+ "sha256:762e91a0ac0caa2d8731568e5b2ad0cec6fc06465a9dd89280118ced4b7e0849",
+ "sha256:7d47e489a8b0181992a3384987c854bd88211685e1c32dcdcb8746ec98dbcf7e",
+ "sha256:7ebe824763782f0e6ad2accd57d0cef3a61922b72be99ccafebe0154e9b8aef6",
+ "sha256:7f1a35be9413bcd56f225b1509740ea8999a6f7558e0f0a50a4ca80b91bf11be",
+ "sha256:81c7847ff0f3561559bd98015aa3fe0a2dfb26966156f7704f7f65339d48e81c",
+ "sha256:9246bf586afaacfdc0e6fb17806ee0d3e1736d3d13a87c8e96214596d50576b7",
+ "sha256:9baff22ae7015e8f2697d5db0804ee379d53fa6e54f1dc7e9f61ee8ccb1bdb2e",
+ "sha256:a4634d7c7e9d2eb65d0fc7fe0d88641eb413cb7250fbfc66b3b4d88d49e4c724",
+ "sha256:a7fa03253260c3351f61cef36865b27ad4585516e9ac4a77244d237bfaf37f13",
+ "sha256:abac4b7afe430dd135f148d4001b593b09c8f64fccd63b15fbb03b77735e3405",
+ "sha256:ad0cfc7f66a2863d199af819c79bfa160bcc830e0f83fd5391cdd80e545af758",
+ "sha256:adabfb5635d00da49bddef3844dc65ca3da81acd889ea7be2a74ef9456558f36",
+ "sha256:ae58e7cc282a768a68abbfa39eff895788a39658c5a235524c21b09d182b3d3a",
+ "sha256:b348074bc3a0e16e9af1a2f57e0da18f5def97e415c6435dadc68aead7ccf060",
+ "sha256:b48e4c8df2c226552ac18636c2ebe9d100ff3daa8742616687bd2cbf74a81e2f",
+ "sha256:c23d9f82aea887347151538a58b15a8dbee4261e4114705c0974dee81eb796e0",
+ "sha256:c2b589be0dd45d62ec43a6446f09919b5be809c708d8ff6a7cb86acd9150091b",
+ "sha256:d13904fc037bcebc6d79e83c0a19e64cc9d4771cd7f27b325c63d1071ec0d0f0",
+ "sha256:d3506e831b972c1eb22538b25e7c991289b72b2e028bd27b633dfbd21c1a511a",
+ "sha256:d809fbf258c28190160b3fe5d34adddb1da44ed7a2f800b7125e0fac6e940016",
+ "sha256:da688ca957d68b8072dc9658506c07326f6332ff3fe03214fec375a4ccc67f8a",
+ "sha256:e001a40b25e478f8390c3898c5852cf9a226668ba02fdc4d8e3a4788ce64207a",
+ "sha256:e805b106edac716047afc6e9e49953242207909bfbb70bf47c53f231e2d27d74",
+ "sha256:eb86cacac6ed203c386afe6704732fb05d831006c65869f15f41d15e9e72973b",
+ "sha256:f5cef3310fc41f71e8fc19d05534d100f6c02789d46041777b0bbd70961a94ec",
+ "sha256:f76b0ad63b55e45d3c02e50ca8b8ef64a500aed9a5f50818173b66949470f8e4",
+ "sha256:fad7fb68e929082e6474e1392dd433d465b06b59e26158ef67813c0c8e5b7f66",
+ "sha256:fb3b3425ee21f5098531ce80dc48d99a555b8b2300deb0ddf84b6bc503f0a848",
+ "sha256:fc53731aa81617694667d4c56d21a9e26df840a219f4b62588af80c6781ba613"
],
"index": "ia",
- "version": "==0.3.6"
+ "version": "==0.3.11"
},
"sentry-sdk": {
"extras": [],
"hashes": [
- "sha256:0db297ab32e095705c20f742c3a5dac62fe15c4318681884053d0898e5abb2f6",
- "sha256:789a11a87ca02491896e121efdd64e8fd93327b69e8f2f7d42f03e2569648e88"
+ "sha256:5bbe4b72de22f9ac1e67f2a4e6efe8fbd595bb59b7b223443f50fe5802a5551c",
+ "sha256:9f0b960694e2d8bb04db4ba6ac2a645040caef4e762c65937998ff06064f10d6"
],
"index": "ia",
- "version": "==1.5.0"
+ "version": "==1.12.1"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==1.16.0"
},
"soupsieve": {
"hashes": [
- "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb",
- "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"
+ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
+ "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
- "markers": "python_version >= '3'",
- "version": "==2.3.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.3.2.post1"
},
"surt": {
"hashes": [
- "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720",
- "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720"
],
"version": "==0.3.1"
},
@@ -1127,59 +1237,64 @@
"sha256:b6650f2d5392a49760064bc55d73ce3397a378ef24ded96efb516c6b8ec68c26",
"sha256:ef5b162d6fa295822dacd4fe4df1b62d8df2550795a97399a8905821b58d3702"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_version >= '2.7' and python_version < '4'",
"version": "==0.12.6"
},
"tldextract": {
"hashes": [
- "sha256:d2034c3558651f7d8fdadea83fb681050b2d662dc67a00d950326dc902029444",
- "sha256:f55e05f6bf4cc952a87d13594386d32ad2dd265630a8bdfc3df03bd60425c6b0"
+ "sha256:47aa4d8f1a4da79a44529c9a2ddc518663b25d371b805194ec5ce2a5f615ccd2",
+ "sha256:78aef13ac1459d519b457a03f1f74c1bf1c2808122a6bcc0e6840f81ba55ad73"
],
- "version": "==3.1.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.4.0"
},
"tqdm": {
"hashes": [
- "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c",
- "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"
+ "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4",
+ "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"
],
- "version": "==4.62.3"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==4.64.1"
},
"trafilatura": {
"hashes": [
- "sha256:10222e9475363091d38d4aa92349974a23363e8a98692b5f902d60bba3e53383",
- "sha256:16a7026a27843d989e5fa30105cb3c03c41f4e61ca276d171041c8014370476a"
+ "sha256:a66189e4b9d591dce648f0cc79fb52a486e679708090189bc4fcd88068f095ef",
+ "sha256:c2bc0cbac6248363d938666cbedbb067ad8aefe31667c88038135b93efd475c3"
],
"index": "ia",
- "version": "==1.0.0"
+ "version": "==1.3.0"
},
"twitter": {
"hashes": [
- "sha256:06eac7ee7f2a14ddeb680671ff07450984f6d254334f5db8dd69547dd1e179c5",
- "sha256:a56ff9575fbd50a51ce91107dcb5a4c3fd00c2ba1bcb172ce538b0948d3626e6"
+ "sha256:1d9a3e45f2c440f308a7116d3672b0d1981aba8ac41cb7f3ed270ed50693f0e0",
+ "sha256:80ddd69ae2eeb88313feedeea31bf119fd6e79541ee5b37abb9c43d233194e10"
],
- "version": "==1.19.3"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==1.19.6"
},
"typing-extensions": {
"hashes": [
- "sha256:4ca091dea149f945ec56afb48dae714f21e8692ef22a395223bcd328961b6a0e",
- "sha256:7f001e5ac290a0c0401508864c7ec868be4e701886d5b573a9528ed3973d9d3b"
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
],
- "version": "==4.0.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"tzdata": {
"hashes": [
- "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
- "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"
+ "sha256:2b88858b0e3120792a3c0635c23daf36a7d7eeeca657c323da299d2094402a0d",
+ "sha256:fe5f866eddd8b96e9fcba978f8e503c909b19ea7efda11e52e39494bad3a7bfa"
],
"markers": "python_version >= '3.6'",
- "version": "==2021.5"
+ "version": "==2022.7"
},
"tzlocal": {
"hashes": [
- "sha256:0f28015ac68a5c067210400a9197fc5d36ba9bc3f8eaf1da3cbd59acdfed9e09",
- "sha256:28ba8d9fcb6c9a782d6e0078b4f6627af1ea26aeaa32b4eab5324abc7df4149f"
+ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
+ "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
- "version": "==4.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.2"
},
"urlcanon": {
"hashes": [
@@ -1190,11 +1305,11 @@
},
"urllib3": {
"hashes": [
- "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
- "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.26.7"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"warctools": {
"hashes": [
@@ -1207,22 +1322,16 @@
"brotli"
],
"hashes": [
- "sha256:942d61217b892781568f33cc17398c979e64550d7725fc0feb49a42c19f0dbd0"
+ "sha256:3a3f149508d68ec53f5cdf434a45e5bb906beef731327d7bd2ef6b751c98281b"
],
"index": "ia",
- "version": "==0.8.1.2"
- },
- "wayback-esp": {
- "hashes": [
- "sha256:d46864aa3022fb05a5639e2faa5c1140c94eff7e8fd1944d6fb69f974067dc6e"
- ],
- "version": "==0.2.14"
+ "version": "==0.8.6.1"
},
"wayback-search-js": {
"hashes": [
- "sha256:eb377f05149f39616b9fda9cbc1d3a45bc3178e583f16d7dd3df174871494412"
+ "sha256:a474ba8da58f9cc27b1dce7f87a8cc7d119715ab4bab750dcc1d90f002074161"
],
- "version": "==3.1.5"
+ "version": "==3.1.21"
},
"wbex-client": {
"hashes": [
@@ -1239,132 +1348,93 @@
},
"werkzeug": {
"hashes": [
- "sha256:63d3dc1cf60e7b7e35e97fa9861f7397283b75d765afcaefd993d6046899de8f",
- "sha256:aa2bb6fc8dee8d6c504c0ac1e7f5f7dc5810a9903e793b6f715a9f015bdadb9a"
+ "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
+ "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
],
- "version": "==2.0.2"
- },
- "wrapt": {
- "hashes": [
- "sha256:086218a72ec7d986a3eddb7707c8c4526d677c7b35e355875a0fe2918b059179",
- "sha256:0877fe981fd76b183711d767500e6b3111378ed2043c145e21816ee589d91096",
- "sha256:0a017a667d1f7411816e4bf214646d0ad5b1da2c1ea13dec6c162736ff25a374",
- "sha256:0cb23d36ed03bf46b894cfec777eec754146d68429c30431c99ef28482b5c1df",
- "sha256:1fea9cd438686e6682271d36f3481a9f3636195578bab9ca3382e2f5f01fc185",
- "sha256:220a869982ea9023e163ba915077816ca439489de6d2c09089b219f4e11b6785",
- "sha256:25b1b1d5df495d82be1c9d2fad408f7ce5ca8a38085e2da41bb63c914baadff7",
- "sha256:2dded5496e8f1592ec27079b28b6ad2a1ef0b9296d270f77b8e4a3a796cf6909",
- "sha256:2ebdde19cd3c8cdf8df3fc165bc7827334bc4e353465048b36f7deeae8ee0918",
- "sha256:43e69ffe47e3609a6aec0fe723001c60c65305784d964f5007d5b4fb1bc6bf33",
- "sha256:46f7f3af321a573fc0c3586612db4decb7eb37172af1bc6173d81f5b66c2e068",
- "sha256:47f0a183743e7f71f29e4e21574ad3fa95676136f45b91afcf83f6a050914829",
- "sha256:498e6217523111d07cd67e87a791f5e9ee769f9241fcf8a379696e25806965af",
- "sha256:4b9c458732450ec42578b5642ac53e312092acf8c0bfce140ada5ca1ac556f79",
- "sha256:51799ca950cfee9396a87f4a1240622ac38973b6df5ef7a41e7f0b98797099ce",
- "sha256:5601f44a0f38fed36cc07db004f0eedeaadbdcec90e4e90509480e7e6060a5bc",
- "sha256:5f223101f21cfd41deec8ce3889dc59f88a59b409db028c469c9b20cfeefbe36",
- "sha256:610f5f83dd1e0ad40254c306f4764fcdc846641f120c3cf424ff57a19d5f7ade",
- "sha256:6a03d9917aee887690aa3f1747ce634e610f6db6f6b332b35c2dd89412912bca",
- "sha256:705e2af1f7be4707e49ced9153f8d72131090e52be9278b5dbb1498c749a1e32",
- "sha256:766b32c762e07e26f50d8a3468e3b4228b3736c805018e4b0ec8cc01ecd88125",
- "sha256:77416e6b17926d953b5c666a3cb718d5945df63ecf922af0ee576206d7033b5e",
- "sha256:778fd096ee96890c10ce96187c76b3e99b2da44e08c9e24d5652f356873f6709",
- "sha256:78dea98c81915bbf510eb6a3c9c24915e4660302937b9ae05a0947164248020f",
- "sha256:7dd215e4e8514004c8d810a73e342c536547038fb130205ec4bba9f5de35d45b",
- "sha256:7dde79d007cd6dfa65afe404766057c2409316135cb892be4b1c768e3f3a11cb",
- "sha256:81bd7c90d28a4b2e1df135bfbd7c23aee3050078ca6441bead44c42483f9ebfb",
- "sha256:85148f4225287b6a0665eef08a178c15097366d46b210574a658c1ff5b377489",
- "sha256:865c0b50003616f05858b22174c40ffc27a38e67359fa1495605f96125f76640",
- "sha256:87883690cae293541e08ba2da22cacaae0a092e0ed56bbba8d018cc486fbafbb",
- "sha256:8aab36778fa9bba1a8f06a4919556f9f8c7b33102bd71b3ab307bb3fecb21851",
- "sha256:8c73c1a2ec7c98d7eaded149f6d225a692caa1bd7b2401a14125446e9e90410d",
- "sha256:936503cb0a6ed28dbfa87e8fcd0a56458822144e9d11a49ccee6d9a8adb2ac44",
- "sha256:944b180f61f5e36c0634d3202ba8509b986b5fbaf57db3e94df11abee244ba13",
- "sha256:96b81ae75591a795d8c90edc0bfaab44d3d41ffc1aae4d994c5aa21d9b8e19a2",
- "sha256:981da26722bebb9247a0601e2922cedf8bb7a600e89c852d063313102de6f2cb",
- "sha256:ae9de71eb60940e58207f8e71fe113c639da42adb02fb2bcbcaccc1ccecd092b",
- "sha256:b73d4b78807bd299b38e4598b8e7bd34ed55d480160d2e7fdaabd9931afa65f9",
- "sha256:d4a5f6146cfa5c7ba0134249665acd322a70d1ea61732723c7d3e8cc0fa80755",
- "sha256:dd91006848eb55af2159375134d724032a2d1d13bcc6f81cd8d3ed9f2b8e846c",
- "sha256:e05e60ff3b2b0342153be4d1b597bbcfd8330890056b9619f4ad6b8d5c96a81a",
- "sha256:e6906d6f48437dfd80464f7d7af1740eadc572b9f7a4301e7dd3d65db285cacf",
- "sha256:e92d0d4fa68ea0c02d39f1e2f9cb5bc4b4a71e8c442207433d8db47ee79d7aa3",
- "sha256:e94b7d9deaa4cc7bac9198a58a7240aaf87fe56c6277ee25fa5b3aa1edebd229",
- "sha256:ea3e746e29d4000cd98d572f3ee2a6050a4f784bb536f4ac1f035987fc1ed83e",
- "sha256:ec7e20258ecc5174029a0f391e1b948bf2906cd64c198a9b8b281b811cbc04de",
- "sha256:ec9465dd69d5657b5d2fa6133b3e1e989ae27d29471a672416fd729b429eb554",
- "sha256:f122ccd12fdc69628786d0c947bdd9cb2733be8f800d88b5a37c57f1f1d73c10",
- "sha256:f99c0489258086308aad4ae57da9e8ecf9e1f3f30fa35d5e170b4d4896554d80",
- "sha256:f9c51d9af9abb899bd34ace878fbec8bf357b3194a10c4e8e0a25512826ef056",
- "sha256:fd76c47f20984b43d93de9a82011bb6e5f8325df6c9ed4d8310029a55fa361ea"
- ],
- "version": "==1.13.3"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.0.3"
},
"zstandard": {
"hashes": [
- "sha256:066488e721ec882485a500c216302b443f2eaef39356f7c65130e76c671e3ce2",
- "sha256:08a728715858f1477239887ba3c692bc462b2c86e7a8e467dc5affa7bba9093f",
- "sha256:11216b47c62e9fc71a25f4b42f525a81da268071bdb434bc1e642ffc38a24a02",
- "sha256:127c4c93f578d9b509732c74ed9b44b23e94041ba11b13827be0a7d2e3869b39",
- "sha256:12dddee2574b00c262270cfb46bd0c048e92208b95fdd39ad2a9eac1cef30498",
- "sha256:1bdda52224043e13ed20f847e3b308de1c9372d1563824fad776b1cf1f847ef0",
- "sha256:2e31680d1bcf85e7a58a45df7365af894402ae77a9868c751dc991dd13099a5f",
- "sha256:42992e89b250fe6878c175119af529775d4be7967cd9de86990145d615d6a444",
- "sha256:453e42af96923582ddbf3acf843f55d2dc534a3f7b345003852dd522aa51eae6",
- "sha256:4d8a296dab7f8f5d53acc693a6785751f43ca39b51c8eabc672f978306fb40e6",
- "sha256:5251ac352d8350869c404a0ca94457da018b726f692f6456ec82bbf907fbc956",
- "sha256:57a6cfc34d906d514358769ed6d510b312be1cf033aafb5db44865a6717579bd",
- "sha256:6ed51162e270b9b8097dcae6f2c239ada05ec112194633193ec3241498988924",
- "sha256:74cbea966462afed5a89eb99e4577538d10d425e05bf6240a75c086d59ccaf89",
- "sha256:87bea44ad24c15cd872263c0d5f912186a4be3db361eab3b25f1a61dcb5ca014",
- "sha256:8a745862ed525eee4e28bdbd58bf3ea952bf9da3c31bb4e4ce11ef15aea5c625",
- "sha256:8b760fc8118b1a0aa1d8f4e2012622e8f5f178d4b8cb94f8c6d2948b6a49a485",
- "sha256:8c8c0e813b67de1c9d7f2760768c4ae53f011c75ace18d5cff4fb40d2173763f",
- "sha256:8d5fe983e23b05f0e924fe8d0dd3935f0c9fd3266e4c6ff8621c12c350da299d",
- "sha256:8f5785c0b9b71d49d789240ae16a636728596631cf100f32b963a6f9857af5a4",
- "sha256:91efd5ea5fb3c347e7ebb6d5622bfa37d72594a2dec37c5dde70b691edb6cc03",
- "sha256:92e6c1a656390176d51125847f2f422f9d8ed468c24b63958f6ee50d9aa98c83",
- "sha256:9bcbfe1ec89789239f63daeea8778488cb5ba9034a374d7753815935f83dad65",
- "sha256:a92aa26789f17ca3b1f45cc7e728597165e2b166b99d1204bb397a672edee761",
- "sha256:a9ec6de2c058e611e9dfe88d9809a5676bc1d2a53543c1273a90a60e41b8f43c",
- "sha256:ac5d97f9dece91a1162f651da79b735c5cde4d5863477785962aad648b592446",
- "sha256:ae19628886d994ac1f3d2fc7f9ed5bb551d81000f7b4e0c57a0e88301aea2766",
- "sha256:b2ea1937eff0ed5621876dc377933fe76624abfb2ab5b418995f43af6bac50de",
- "sha256:b46220bef7bf9271a2a05512e86acbabc86cca08bebde8447bdbb4acb3179447",
- "sha256:b61586b0ff55c4137e512f1e9df4e4d7a6e1e9df782b4b87652df27737c90cc1",
- "sha256:be68fbac1e88f0dbe033a2d2e3aaaf9c8307730b905f3cd3c698ca4b904f0702",
- "sha256:c75557d53bb2d064521ff20cce9b8a51ee8301e031b1d6bcedb6458dda3bc85d",
- "sha256:c7e6b6ad58ae6f77872da9376ef0ecbf8c1ae7a0c8fc29a2473abc90f79a9a1b",
- "sha256:c8828f4e78774a6c0b8d21e59677f8f48d2e17fe2ef72793c94c10abc032c41c",
- "sha256:cae9bfcb9148152f8bfb9163b4b779326ca39fe9889e45e0572c56d25d5021be",
- "sha256:ce61492764d0442ca1e81d38d7bf7847d7df5003bce28089bab64c0519749351",
- "sha256:d40447f4a44b442fa6715779ff49a1e319729d829198279927d18bca0d7ac32d",
- "sha256:d9946cfe54bf3365f14a5aa233eb2425de3b77eac6a4c7d03dda7dbb6acd3267",
- "sha256:dd5a2287893e52204e4ce9d0e1bcea6240661dbb412efb53d5446b881d3c10a2",
- "sha256:e9456492eb13249841e53221e742bef93f4868122bfc26bafa12a07677619732",
- "sha256:eaae2d3e8fdf8bfe269628385087e4b648beef85bb0c187644e7df4fb0fe9046",
- "sha256:eba125d3899f2003debf97019cd6f46f841a405df067da23d11443ad17952a40",
- "sha256:ef759c1dfe78aa5a01747d3465d2585de14e08fc2b0195ce3f31f45477fc5a72",
- "sha256:ffe1d24c5e11e98e4c5f96f846cdd19619d8c7e5e8e5082bed62d39baa30cecb"
+ "sha256:04c298d381a3b6274b0a8001f0da0ec7819d052ad9c3b0863fe8c7f154061f76",
+ "sha256:0fde1c56ec118940974e726c2a27e5b54e71e16c6f81d0b4722112b91d2d9009",
+ "sha256:126aa8433773efad0871f624339c7984a9c43913952f77d5abeee7f95a0c0860",
+ "sha256:1a4fb8b4ac6772e4d656103ccaf2e43e45bd16b5da324b963d58ef360d09eb73",
+ "sha256:2e4812720582d0803e84aefa2ac48ce1e1e6e200ca3ce1ae2be6d410c1d637ae",
+ "sha256:2f01b27d0b453f07cbcff01405cdd007e71f5d6410eb01303a16ba19213e58e4",
+ "sha256:31d12fcd942dd8dbf52ca5f6b1bbe287f44e5d551a081a983ff3ea2082867863",
+ "sha256:3c927b6aa682c6d96225e1c797f4a5d0b9f777b327dea912b23471aaf5385376",
+ "sha256:3d5bb598963ac1f1f5b72dd006adb46ca6203e4fb7269a5b6e1f99e85b07ad38",
+ "sha256:401508efe02341ae681752a87e8ac9ef76df85ef1a238a7a21786a489d2c983d",
+ "sha256:4514b19abe6dbd36d6c5d75c54faca24b1ceb3999193c5b1f4b685abeabde3d0",
+ "sha256:47dfa52bed3097c705451bafd56dac26535545a987b6759fa39da1602349d7ba",
+ "sha256:4fa496d2d674c6e9cffc561639d17009d29adee84a27cf1e12d3c9be14aa8feb",
+ "sha256:55a513ec67e85abd8b8b83af8813368036f03e2d29a50fc94033504918273980",
+ "sha256:55b3187e0bed004533149882ef8c24e954321f3be81f8a9ceffe35099b82a0d0",
+ "sha256:593f96718ad906e24d6534187fdade28b611f8ed06e27ba972ba48aecec45fc6",
+ "sha256:5e21032efe673b887464667d09406bab6e16d96b09ad87e80859e3a20b6745b6",
+ "sha256:60a86b7b2b1c300779167cf595e019e61afcc0e20c4838692983a921db9006ac",
+ "sha256:619f9bf37cdb4c3dc9d4120d2a1003f5db9446f3618a323219f408f6a9df6725",
+ "sha256:660b91eca10ee1b44c47843894abe3e6cfd80e50c90dee3123befbf7ca486bd3",
+ "sha256:67710d220af405f5ce22712fa741d85e8b3ada7a457ea419b038469ba379837c",
+ "sha256:6caed86cd47ae93915d9031dc04be5283c275e1a2af2ceff33932071f3eeff4d",
+ "sha256:6d2182e648e79213b3881998b30225b3f4b1f3e681f1c1eaf4cacf19bde1040d",
+ "sha256:72758c9f785831d9d744af282d54c3e0f9db34f7eae521c33798695464993da2",
+ "sha256:74c2637d12eaacb503b0b06efdf55199a11b1d7c580bd3dd9dfe84cac97ef2f6",
+ "sha256:755020d5aeb1b10bffd93d119e7709a2a7475b6ad79c8d5226cea3f76d152ce0",
+ "sha256:7ccc4727300f223184520a6064c161a90b5d0283accd72d1455bcd85ec44dd0d",
+ "sha256:81ab21d03e3b0351847a86a0b298b297fde1e152752614138021d6d16a476ea6",
+ "sha256:8371217dff635cfc0220db2720fc3ce728cd47e72bb7572cca035332823dbdfc",
+ "sha256:876567136b0359f6581ecd892bdb4ca03a0eead0265db73206c78cff03bcdb0f",
+ "sha256:879411d04068bd489db57dcf6b82ffad3c5fb2a1fdd30817c566d8b7bedee442",
+ "sha256:898500957ae5e7f31b7271ace4e6f3625b38c0ac84e8cedde8de3a77a7fdae5e",
+ "sha256:8c9ca56345b0c5574db47560603de9d05f63cce5dfeb3a456eb60f3fec737ff2",
+ "sha256:8ec2c146e10b59c376b6bc0369929647fcd95404a503a7aa0990f21c16462248",
+ "sha256:8f7c68de4f362c1b2f426395fe4e05028c56d0782b2ec3ae18a5416eaf775576",
+ "sha256:909bdd4e19ea437eb9b45d6695d722f6f0fd9d8f493e837d70f92062b9f39faf",
+ "sha256:9d97c713433087ba5cee61a3e8edb54029753d45a4288ad61a176fa4718033ce",
+ "sha256:a65e0119ad39e855427520f7829618f78eb2824aa05e63ff19b466080cd99210",
+ "sha256:aa9087571729c968cd853d54b3f6e9d0ec61e45cd2c31e0eb8a0d4bdbbe6da2f",
+ "sha256:aef0889417eda2db000d791f9739f5cecb9ccdd45c98f82c6be531bdc67ff0f2",
+ "sha256:b253d0c53c8ee12c3e53d181fb9ef6ce2cd9c41cbca1c56a535e4fc8ec41e241",
+ "sha256:b80f6f6478f9d4ca26daee6c61584499493bf97950cfaa1a02b16bb5c2c17e70",
+ "sha256:be6329b5ba18ec5d32dc26181e0148e423347ed936dda48bf49fb243895d1566",
+ "sha256:c7560f622e3849cc8f3e999791a915addd08fafe80b47fcf3ffbda5b5151047c",
+ "sha256:d1a7a716bb04b1c3c4a707e38e2dee46ac544fff931e66d7ae944f3019fc55b8",
+ "sha256:d63b04e16df8ea21dfcedbf5a60e11cbba9d835d44cb3cbff233cfd037a916d5",
+ "sha256:d777d239036815e9b3a093fa9208ad314c040c26d7246617e70e23025b60083a",
+ "sha256:e892d3177380ec080550b56a7ffeab680af25575d291766bdd875147ba246a91",
+ "sha256:e9c90a44470f2999779057aeaf33461cbd8bb59d8f15e983150d10bb260e16e0",
+ "sha256:f097dda5d4f9b9b01b3c9fa2069f9c02929365f48f341feddf3d6b32510a2f93",
+ "sha256:f4ebfe03cbae821ef994b2e58e4df6a087470cc522aca502614e82a143365d45"
],
"index": "ia",
- "version": "==0.16.0"
+ "version": "==0.19.0"
}
},
"develop": {
"astroid": {
"hashes": [
- "sha256:5939cf55de24b92bda00345d4d0659d01b3c7dafb5055165c330bc7c568ba273",
- "sha256:776ca0b748b4ad69c00bfe0fff38fa2d21c338e12c84aa9715ee0d473c422778"
+ "sha256:10e0ad5f7b79c435179d0d0f0df69998c4eef4597534aae44910db060baeb907",
+ "sha256:1493fe8bd3dfd73dc35bd53c9d5b6e49ead98497c47b2307662556a5692d29d7"
],
- "version": "==2.9.0"
+ "markers": "python_full_version >= '3.7.2'",
+ "version": "==2.12.13"
+ },
+ "asttokens": {
+ "hashes": [
+ "sha256:4622110b2a6f30b77e1473affaa97e711bc2f07d3f10848420ff1898edbe94f3",
+ "sha256:6b0ac9e93fb0335014d382b8fa9b3afa7df546984258005da0b9e7095b3deb1c"
+ ],
+ "version": "==2.2.1"
},
"attrs": {
"hashes": [
- "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
- "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
+ "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
+ "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"
],
- "version": "==21.2.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==22.2.0"
},
"backcall": {
"hashes": [
@@ -1375,118 +1445,150 @@
},
"black": {
"hashes": [
- "sha256:380f1b5da05e5a1429225676655dddb96f5ae8c75bdf91e53d798871b902a115",
- "sha256:7de4cfc7eb6b710de325712d40125689101d21d25283eed7e9998722cf10eb91"
+ "sha256:101c69b23df9b44247bd88e1d7e90154336ac4992502d4197bdac35dd7ee3320",
+ "sha256:159a46a4947f73387b4d83e87ea006dbb2337eab6c879620a3ba52699b1f4351",
+ "sha256:1f58cbe16dfe8c12b7434e50ff889fa479072096d79f0a7f25e4ab8e94cd8350",
+ "sha256:229351e5a18ca30f447bf724d007f890f97e13af070bb6ad4c0a441cd7596a2f",
+ "sha256:436cc9167dd28040ad90d3b404aec22cedf24a6e4d7de221bec2730ec0c97bcf",
+ "sha256:559c7a1ba9a006226f09e4916060982fd27334ae1998e7a38b3f33a37f7a2148",
+ "sha256:7412e75863aa5c5411886804678b7d083c7c28421210180d67dfd8cf1221e1f4",
+ "sha256:77d86c9f3db9b1bf6761244bc0b3572a546f5fe37917a044e02f3166d5aafa7d",
+ "sha256:82d9fe8fee3401e02e79767016b4907820a7dc28d70d137eb397b92ef3cc5bfc",
+ "sha256:9eedd20838bd5d75b80c9f5487dbcb06836a43833a37846cf1d8c1cc01cef59d",
+ "sha256:c116eed0efb9ff870ded8b62fe9f28dd61ef6e9ddd28d83d7d264a38417dcee2",
+ "sha256:d30b212bffeb1e252b31dd269dfae69dd17e06d92b87ad26e23890f3efea366f"
],
"index": "ia",
- "version": "==21.9b0"
+ "version": "==22.12.0"
},
"certifi": {
"hashes": [
- "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
- "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2021.10.8"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
"charset-normalizer": {
"hashes": [
- "sha256:1eecaa09422db5be9e29d7fc65664e6c33bd06f9ced7838578ba40d58bdf3721",
- "sha256:b0b883e8e874edfdece9c28f314e3dd5badf067342e42fb162203335ae61aa2c"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "markers": "python_version >= '3'",
- "version": "==2.0.9"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
"click": {
"hashes": [
- "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3",
- "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"
+ "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e",
+ "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"
],
- "version": "==8.0.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==8.1.3"
},
"coverage": {
"extras": [
"toml"
],
"hashes": [
- "sha256:01774a2c2c729619760320270e42cd9e797427ecfddd32c2a7b639cdc481f3c0",
- "sha256:03b20e52b7d31be571c9c06b74746746d4eb82fc260e594dc662ed48145e9efd",
- "sha256:0a7726f74ff63f41e95ed3a89fef002916c828bb5fcae83b505b49d81a066884",
- "sha256:1219d760ccfafc03c0822ae2e06e3b1248a8e6d1a70928966bafc6838d3c9e48",
- "sha256:13362889b2d46e8d9f97c421539c97c963e34031ab0cb89e8ca83a10cc71ac76",
- "sha256:174cf9b4bef0db2e8244f82059a5a72bd47e1d40e71c68ab055425172b16b7d0",
- "sha256:17e6c11038d4ed6e8af1407d9e89a2904d573be29d51515f14262d7f10ef0a64",
- "sha256:215f8afcc02a24c2d9a10d3790b21054b58d71f4b3c6f055d4bb1b15cecce685",
- "sha256:22e60a3ca5acba37d1d4a2ee66e051f5b0e1b9ac950b5b0cf4aa5366eda41d47",
- "sha256:2641f803ee9f95b1f387f3e8f3bf28d83d9b69a39e9911e5bfee832bea75240d",
- "sha256:276651978c94a8c5672ea60a2656e95a3cce2a3f31e9fb2d5ebd4c215d095840",
- "sha256:3f7c17209eef285c86f819ff04a6d4cbee9b33ef05cbcaae4c0b4e8e06b3ec8f",
- "sha256:3feac4084291642165c3a0d9eaebedf19ffa505016c4d3db15bfe235718d4971",
- "sha256:49dbff64961bc9bdd2289a2bda6a3a5a331964ba5497f694e2cbd540d656dc1c",
- "sha256:4e547122ca2d244f7c090fe3f4b5a5861255ff66b7ab6d98f44a0222aaf8671a",
- "sha256:5829192582c0ec8ca4a2532407bc14c2f338d9878a10442f5d03804a95fac9de",
- "sha256:5d6b09c972ce9200264c35a1d53d43ca55ef61836d9ec60f0d44273a31aa9f17",
- "sha256:600617008aa82032ddeace2535626d1bc212dfff32b43989539deda63b3f36e4",
- "sha256:619346d57c7126ae49ac95b11b0dc8e36c1dd49d148477461bb66c8cf13bb521",
- "sha256:63c424e6f5b4ab1cf1e23a43b12f542b0ec2e54f99ec9f11b75382152981df57",
- "sha256:6dbc1536e105adda7a6312c778f15aaabe583b0e9a0b0a324990334fd458c94b",
- "sha256:6e1394d24d5938e561fbeaa0cd3d356207579c28bd1792f25a068743f2d5b282",
- "sha256:86f2e78b1eff847609b1ca8050c9e1fa3bd44ce755b2ec30e70f2d3ba3844644",
- "sha256:8bdfe9ff3a4ea37d17f172ac0dff1e1c383aec17a636b9b35906babc9f0f5475",
- "sha256:8e2c35a4c1f269704e90888e56f794e2d9c0262fb0c1b1c8c4ee44d9b9e77b5d",
- "sha256:92b8c845527eae547a2a6617d336adc56394050c3ed8a6918683646328fbb6da",
- "sha256:9365ed5cce5d0cf2c10afc6add145c5037d3148585b8ae0e77cc1efdd6aa2953",
- "sha256:9a29311bd6429be317c1f3fe4bc06c4c5ee45e2fa61b2a19d4d1d6111cb94af2",
- "sha256:9a2b5b52be0a8626fcbffd7e689781bf8c2ac01613e77feda93d96184949a98e",
- "sha256:a4bdeb0a52d1d04123b41d90a4390b096f3ef38eee35e11f0b22c2d031222c6c",
- "sha256:a9c8c4283e17690ff1a7427123ffb428ad6a52ed720d550e299e8291e33184dc",
- "sha256:b637c57fdb8be84e91fac60d9325a66a5981f8086c954ea2772efe28425eaf64",
- "sha256:bf154ba7ee2fd613eb541c2bc03d3d9ac667080a737449d1a3fb342740eb1a74",
- "sha256:c254b03032d5a06de049ce8bca8338a5185f07fb76600afff3c161e053d88617",
- "sha256:c332d8f8d448ded473b97fefe4a0983265af21917d8b0cdcb8bb06b2afe632c3",
- "sha256:c7912d1526299cb04c88288e148c6c87c0df600eca76efd99d84396cfe00ef1d",
- "sha256:cfd9386c1d6f13b37e05a91a8583e802f8059bebfccde61a418c5808dea6bbfa",
- "sha256:d5d2033d5db1d58ae2d62f095e1aefb6988af65b4b12cb8987af409587cc0739",
- "sha256:dca38a21e4423f3edb821292e97cec7ad38086f84313462098568baedf4331f8",
- "sha256:e2cad8093172b7d1595b4ad66f24270808658e11acf43a8f95b41276162eb5b8",
- "sha256:e3db840a4dee542e37e09f30859f1612da90e1c5239a6a2498c473183a50e781",
- "sha256:edcada2e24ed68f019175c2b2af2a8b481d3d084798b8c20d15d34f5c733fa58",
- "sha256:f467bbb837691ab5a8ca359199d3429a11a01e6dfb3d9dcc676dc035ca93c0a9",
- "sha256:f506af4f27def639ba45789fa6fde45f9a217da0be05f8910458e4557eed020c",
- "sha256:f614fc9956d76d8a88a88bb41ddc12709caa755666f580af3a688899721efecd",
- "sha256:f9afb5b746781fc2abce26193d1c817b7eb0e11459510fba65d2bd77fe161d9e",
- "sha256:fb8b8ee99b3fffe4fd86f4c81b35a6bf7e4462cba019997af2fe679365db0c49"
- ],
- "version": "==6.2"
+ "sha256:07bcfb1d8ac94af886b54e18a88b393f6a73d5959bb31e46644a02453c36e475",
+ "sha256:09f6b5a8415b6b3e136d5fec62b552972187265cb705097bf030eb9d4ffb9b60",
+ "sha256:0a79137fc99815fff6a852c233628e735ec15903cfd16da0f229d9c4d45926ab",
+ "sha256:0b4b3a4d9915b2be879aff6299c0a6129f3d08a775d5a061f503cf79571f73e4",
+ "sha256:1285648428a6101b5f41a18991c84f1c3959cee359e51b8375c5882fc364a13f",
+ "sha256:12a5aa77783d49e05439fbe6e6b427484f8a0f9f456b46a51d8aac022cfd024d",
+ "sha256:19ec666533f0f70a0993f88b8273057b96c07b9d26457b41863ccd021a043b9a",
+ "sha256:1e414dc32ee5c3f36544ea466b6f52f28a7af788653744b8570d0bf12ff34bc0",
+ "sha256:2c44fcfb3781b41409d0f060a4ed748537557de9362a8a9282182fafb7a76ab4",
+ "sha256:397b4a923cc7566bbc7ae2dfd0ba5a039b61d19c740f1373791f2ebd11caea59",
+ "sha256:3cfc595d2af13856505631be072835c59f1acf30028d1c860b435c5fc9c15b69",
+ "sha256:3dd4ee135e08037f458425b8842d24a95a0961831a33f89685ff86b77d378f89",
+ "sha256:486ee81fa694b4b796fc5617e376326a088f7b9729c74d9defa211813f3861e4",
+ "sha256:4f943a3b2bc520102dd3e0bb465e1286e12c9a54f58accd71b9e65324d9c7c01",
+ "sha256:63d56165a7c76265468d7e0c5548215a5ba515fc2cba5232d17df97bffa10f6c",
+ "sha256:66b18c3cf8bbab0cce0d7b9e4262dc830e93588986865a8c78ab2ae324b3ed56",
+ "sha256:691571f31ace1837838b7e421d3a09a8c00b4aac32efacb4fc9bd0a5c647d25a",
+ "sha256:6c5ad996c6fa4d8ed669cfa1e8551348729d008a2caf81489ab9ea67cfbc7498",
+ "sha256:6d55d840e1b8c0002fce66443e124e8581f30f9ead2e54fbf6709fb593181f2c",
+ "sha256:72d1507f152abacea81f65fee38e4ef3ac3c02ff8bc16f21d935fd3a8a4ad910",
+ "sha256:74f70cd92669394eaf8d7756d1b195c8032cf7bbbdfce3bc489d4e15b3b8cf73",
+ "sha256:830525361249dc4cd013652b0efad645a385707a5ae49350c894b67d23fbb07c",
+ "sha256:854f22fa361d1ff914c7efa347398374cc7d567bdafa48ac3aa22334650dfba2",
+ "sha256:89caf4425fe88889e2973a8e9a3f6f5f9bbe5dd411d7d521e86428c08a873a4a",
+ "sha256:9158f8fb06747ac17bd237930c4372336edc85b6e13bdc778e60f9d685c3ca37",
+ "sha256:92651580bd46519067e36493acb394ea0607b55b45bd81dd4e26379ed1871f55",
+ "sha256:978258fec36c154b5e250d356c59af7d4c3ba02bef4b99cda90b6029441d797d",
+ "sha256:9823e4789ab70f3ec88724bba1a203f2856331986cd893dedbe3e23a6cfc1e4e",
+ "sha256:9b373c9345c584bb4b5f5b8840df7f4ab48c4cbb7934b58d52c57020d911b856",
+ "sha256:a4a574a19eeb67575a5328a5760bbbb737faa685616586a9f9da4281f940109c",
+ "sha256:aec2d1515d9d39ff270059fd3afbb3b44e6ec5758af73caf18991807138c7118",
+ "sha256:b3695c4f4750bca943b3e1f74ad4be8d29e4aeab927d50772c41359107bd5d5c",
+ "sha256:b3763e7fcade2ff6c8e62340af9277f54336920489ceb6a8cd6cc96da52fcc62",
+ "sha256:b66bb21a23680dee0be66557dc6b02a3152ddb55edf9f6723fa4a93368f7158d",
+ "sha256:b6f22bb64cc39bcb883e5910f99a27b200fdc14cdd79df8696fa96b0005c9444",
+ "sha256:b77015d1cb8fe941be1222a5a8b4e3fbca88180cfa7e2d4a4e58aeabadef0ab7",
+ "sha256:b9ea158775c7c2d3e54530a92da79496fb3fb577c876eec761c23e028f1e216c",
+ "sha256:c20cfebcc149a4c212f6491a5f9ff56f41829cd4f607b5be71bb2d530ef243b1",
+ "sha256:cfded268092a84605f1cc19e5c737f9ce630a8900a3589e9289622db161967e9",
+ "sha256:d1991f1dd95eba69d2cd7708ff6c2bbd2426160ffc73c2b81f617a053ebcb1a8",
+ "sha256:d3022c3007d3267a880b5adcf18c2a9bf1fc64469b394a804886b401959b8742",
+ "sha256:d6814854c02cbcd9c873c0f3286a02e3ac1250625cca822ca6bc1018c5b19f1c",
+ "sha256:d87717959d4d0ee9db08a0f1d80d21eb585aafe30f9b0a54ecf779a69cb015f6",
+ "sha256:e00c14720b8b3b6c23b487e70bd406abafc976ddc50490f645166f111c419c39",
+ "sha256:e60bef2e2416f15fdc05772bf87db06c6a6f9870d1db08fdd019fbec98ae24a9",
+ "sha256:e78e9dcbf4f3853d3ae18a8f9272111242531535ec9e1009fa8ec4a2b74557dc",
+ "sha256:f66460f17c9319ea4f91c165d46840314f0a7c004720b20be58594d162a441d8",
+ "sha256:fa6a5a224b7f4cfb226f4fc55a57e8537fcc096f42219128c2c74c0e7d0953e1",
+ "sha256:fb992c47cb1e5bd6a01e97182400bcc2ba2077080a17fcd7be23aaa6e572e390",
+ "sha256:fd1b9c5adc066db699ccf7fa839189a649afcdd9e02cb5dc9d24e67e7922737d",
+ "sha256:fd556ff16a57a070ce4f31c635953cc44e25244f91a0378c6e9bdfd40fdb249f"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==7.0.1"
},
"decorator": {
"hashes": [
- "sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374",
- "sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
],
- "version": "==5.1.0"
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
+ },
+ "dill": {
+ "hashes": [
+ "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0",
+ "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"
+ ],
+ "markers": "python_version < '3.11'",
+ "version": "==0.3.6"
+ },
+ "executing": {
+ "hashes": [
+ "sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc",
+ "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107"
+ ],
+ "version": "==1.2.0"
},
"flake8": {
"hashes": [
- "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
- "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
+ "sha256:3833794e27ff64ea4e9cf5d410082a8b97ff1a06c16aa3d2027339cd0f1195c7",
+ "sha256:c61007e76655af75e6785a931f452915b371dc48f56efd765247c8fe68f2b181"
],
"index": "ia",
- "version": "==4.0.1"
+ "version": "==6.0.0"
},
"flake8-annotations": {
"hashes": [
- "sha256:3edfbbfb58e404868834fe6ec3eaf49c139f64f0701259f707d043185545151e",
- "sha256:52e53c05b0c06cac1c2dec192ea2c36e85081238add3bd99421d56f574b9479b"
+ "sha256:11f09efb99ae63c8f9d6b492b75fe147fbc323179fddfe00b2e56eefeca42f57",
+ "sha256:a4385158a7a9fc8af1d8820a2f4c8d03387997006a83f5f8bfe5bc6085bdf88a"
],
"index": "ia",
- "version": "==2.7.0"
+ "version": "==2.9.1"
},
"idna": {
"hashes": [
"sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
"sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
],
- "markers": "python_version >= '3'",
"version": "==2.6"
},
"iniconfig": {
@@ -1498,96 +1600,104 @@
},
"ipython": {
"hashes": [
- "sha256:cb6aef731bf708a7727ab6cde8df87f0281b1427d41e65d62d4b68934fa54e97",
- "sha256:fc60ef843e0863dd4e24ab2bb5698f071031332801ecf8d1aeb4fb622056545c"
+ "sha256:352042ddcb019f7c04e48171b4dd78e4c4bb67bf97030d170e154aac42b656d9",
+ "sha256:882899fe78d5417a0aa07f995db298fa28b58faeba2112d2e3a4c95fe14bb738"
],
"index": "ia",
- "version": "==7.30.1"
+ "version": "==8.7.0"
},
"isort": {
"hashes": [
- "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7",
- "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"
+ "sha256:6db30c5ded9815d813932c04c2f85a360bcdd35fed496f4d8f35495ef0a261b6",
+ "sha256:c033fd0edb91000a7f09527fe5c75321878f98322a77ddcc81adbd83724afb7b"
],
"index": "ia",
- "version": "==5.10.1"
+ "version": "==5.11.4"
},
"jedi": {
"hashes": [
- "sha256:637c9635fcf47945ceb91cd7f320234a7be540ded6f3e99a50cb6febdfd1ba8d",
- "sha256:74137626a64a99c8eb6ae5832d99b3bdd7d29a3850fe2aa80a4126b2a7d949ab"
+ "sha256:203c1fd9d969ab8f2119ec0a3342e0b49910045abe6af0a3ae83a5764d54639e",
+ "sha256:bae794c30d07f6d910d32a7048af09b5a39ed740918da923c6b780790ebac612"
],
- "version": "==0.18.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.18.2"
},
"lazy-object-proxy": {
"hashes": [
- "sha256:17e0967ba374fc24141738c69736da90e94419338fd4c7c7bef01ee26b339653",
- "sha256:1fee665d2638491f4d6e55bd483e15ef21f6c8c2095f235fef72601021e64f61",
- "sha256:22ddd618cefe54305df49e4c069fa65715be4ad0e78e8d252a33debf00f6ede2",
- "sha256:24a5045889cc2729033b3e604d496c2b6f588c754f7a62027ad4437a7ecc4837",
- "sha256:410283732af311b51b837894fa2f24f2c0039aa7f220135192b38fcc42bd43d3",
- "sha256:4732c765372bd78a2d6b2150a6e99d00a78ec963375f236979c0626b97ed8e43",
- "sha256:489000d368377571c6f982fba6497f2aa13c6d1facc40660963da62f5c379726",
- "sha256:4f60460e9f1eb632584c9685bccea152f4ac2130e299784dbaf9fae9f49891b3",
- "sha256:5743a5ab42ae40caa8421b320ebf3a998f89c85cdc8376d6b2e00bd12bd1b587",
- "sha256:85fb7608121fd5621cc4377a8961d0b32ccf84a7285b4f1d21988b2eae2868e8",
- "sha256:9698110e36e2df951c7c36b6729e96429c9c32b3331989ef19976592c5f3c77a",
- "sha256:9d397bf41caad3f489e10774667310d73cb9c4258e9aed94b9ec734b34b495fd",
- "sha256:b579f8acbf2bdd9ea200b1d5dea36abd93cabf56cf626ab9c744a432e15c815f",
- "sha256:b865b01a2e7f96db0c5d12cfea590f98d8c5ba64ad222300d93ce6ff9138bcad",
- "sha256:bf34e368e8dd976423396555078def5cfc3039ebc6fc06d1ae2c5a65eebbcde4",
- "sha256:c6938967f8528b3668622a9ed3b31d145fab161a32f5891ea7b84f6b790be05b",
- "sha256:d1c2676e3d840852a2de7c7d5d76407c772927addff8d742b9808fe0afccebdf",
- "sha256:d7124f52f3bd259f510651450e18e0fd081ed82f3c08541dffc7b94b883aa981",
- "sha256:d900d949b707778696fdf01036f58c9876a0d8bfe116e8d220cfd4b15f14e741",
- "sha256:ebfd274dcd5133e0afae738e6d9da4323c3eb021b3e13052d8cbd0e457b1256e",
- "sha256:ed361bb83436f117f9917d282a456f9e5009ea12fd6de8742d1a4752c3017e93",
- "sha256:f5144c75445ae3ca2057faac03fda5a902eff196702b0a24daf1d6ce0650514b"
- ],
- "version": "==1.6.0"
+ "sha256:0c1c7c0433154bb7c54185714c6929acc0ba04ee1b167314a779b9025517eada",
+ "sha256:14010b49a2f56ec4943b6cf925f597b534ee2fe1f0738c84b3bce0c1a11ff10d",
+ "sha256:4e2d9f764f1befd8bdc97673261b8bb888764dfdbd7a4d8f55e4fbcabb8c3fb7",
+ "sha256:4fd031589121ad46e293629b39604031d354043bb5cdf83da4e93c2d7f3389fe",
+ "sha256:5b51d6f3bfeb289dfd4e95de2ecd464cd51982fe6f00e2be1d0bf94864d58acd",
+ "sha256:6850e4aeca6d0df35bb06e05c8b934ff7c533734eb51d0ceb2d63696f1e6030c",
+ "sha256:6f593f26c470a379cf7f5bc6db6b5f1722353e7bf937b8d0d0b3fba911998858",
+ "sha256:71d9ae8a82203511a6f60ca5a1b9f8ad201cac0fc75038b2dc5fa519589c9288",
+ "sha256:7e1561626c49cb394268edd00501b289053a652ed762c58e1081224c8d881cec",
+ "sha256:8f6ce2118a90efa7f62dd38c7dbfffd42f468b180287b748626293bf12ed468f",
+ "sha256:ae032743794fba4d171b5b67310d69176287b5bf82a21f588282406a79498891",
+ "sha256:afcaa24e48bb23b3be31e329deb3f1858f1f1df86aea3d70cb5c8578bfe5261c",
+ "sha256:b70d6e7a332eb0217e7872a73926ad4fdc14f846e85ad6749ad111084e76df25",
+ "sha256:c219a00245af0f6fa4e95901ed28044544f50152840c5b6a3e7b2568db34d156",
+ "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8",
+ "sha256:d176f392dbbdaacccf15919c77f526edf11a34aece58b55ab58539807b85436f",
+ "sha256:e20bfa6db17a39c706d24f82df8352488d2943a3b7ce7d4c22579cb89ca8896e",
+ "sha256:eac3a9a5ef13b332c059772fd40b4b1c3d45a3a2b05e33a361dee48e54a4dad0",
+ "sha256:eb329f8d8145379bf5dbe722182410fe8863d186e51bf034d2075eb8d85ee25b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.8.0"
},
"matplotlib-inline": {
"hashes": [
- "sha256:a04bfba22e0d1395479f866853ec1ee28eea1485c1d69a6faf00dc3e24ff34ee",
- "sha256:aed605ba3b72462d64d475a21a9296f400a19c4f74a31b59103d2a99ffd5aa5c"
+ "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311",
+ "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"
],
- "version": "==0.1.3"
+ "markers": "python_version >= '3.5'",
+ "version": "==0.1.6"
},
"mccabe": {
"hashes": [
- "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
- "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325",
+ "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"
],
- "version": "==0.6.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.7.0"
},
"mypy": {
"hashes": [
- "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
- "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
- "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
- "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
- "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
- "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
- "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
- "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
- "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
- "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
- "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
- "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
- "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
- "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
- "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
- "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
- "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
- "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
- "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
- "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
- "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
- "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
- "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
+ "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d",
+ "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6",
+ "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf",
+ "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f",
+ "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813",
+ "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33",
+ "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad",
+ "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05",
+ "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297",
+ "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06",
+ "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd",
+ "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243",
+ "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305",
+ "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476",
+ "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711",
+ "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70",
+ "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5",
+ "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461",
+ "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab",
+ "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c",
+ "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d",
+ "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135",
+ "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93",
+ "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648",
+ "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a",
+ "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb",
+ "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3",
+ "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372",
+ "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb",
+ "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"
],
"index": "ia",
- "version": "==0.910"
+ "version": "==0.991"
},
"mypy-extensions": {
"hashes": [
@@ -1598,24 +1708,27 @@
},
"packaging": {
"hashes": [
- "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
- "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
+ "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
+ "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
],
- "version": "==21.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==22.0"
},
"parso": {
"hashes": [
"sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0",
"sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"
],
+ "markers": "python_version >= '3.6'",
"version": "==0.8.3"
},
"pathspec": {
"hashes": [
- "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a",
- "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"
+ "sha256:3c95343af8b756205e2aba76e843ba9520a24dd84f68c22b9f93251507509dd6",
+ "sha256:56200de4077d9d0791465aa9095a01d421861e405b5096955051deefd697d6f6"
],
- "version": "==0.9.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.10.3"
},
"pexpect": {
"hashes": [
@@ -1634,24 +1747,27 @@
},
"platformdirs": {
"hashes": [
- "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
- "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
+ "sha256:1a89a12377800c81983db6be069ec068eee989748799b946cce2a6e80dcc54ca",
+ "sha256:b46ffafa316e6b83b47489d240ce17173f123a9b9c83282141c3daf26ad9ac2e"
],
- "version": "==2.4.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==2.6.0"
},
"pluggy": {
"hashes": [
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
+ "markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"prompt-toolkit": {
"hashes": [
- "sha256:1bb05628c7d87b645974a1bad3f17612be0c29fa39af9f7688030163f680bad6",
- "sha256:e56f2ff799bacecd3e88165b1e2f5ebf9bcd59e80e06d395fa0cc4b8bd7bb506"
+ "sha256:3e163f254bef5a03b146397d7c1963bd3e2812f0964bb9a24e6ec761fd28db63",
+ "sha256:aa64ad242a462c5ff0363a7b9cfe696c20d55d9fc60c11fd8e632d064804d305"
],
- "version": "==3.0.24"
+ "markers": "python_full_version >= '3.6.2'",
+ "version": "==3.0.36"
},
"ptyprocess": {
"hashes": [
@@ -1660,48 +1776,52 @@
],
"version": "==0.7.0"
},
+ "pure-eval": {
+ "hashes": [
+ "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350",
+ "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"
+ ],
+ "version": "==0.2.2"
+ },
"py": {
"hashes": [
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pycodestyle": {
"hashes": [
- "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
- "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
+ "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053",
+ "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610"
],
- "version": "==2.8.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.10.0"
},
"pyflakes": {
"hashes": [
- "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
- "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
+ "sha256:ec55bf7fe21fff7f1ad2f7da62363d749e2a470500eab1b555334b67aa1ef8cf",
+ "sha256:ec8b276a6b60bd80defed25add7e439881c19e64850afd9b346283d4165fd0fd"
],
- "version": "==2.4.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==3.0.1"
},
"pygments": {
"hashes": [
- "sha256:b8e67fe6af78f492b3c4b3e2970c0624cbf08beb1e493b2c99b9fa1b67a20380",
- "sha256:f398865f7eb6874156579fdf36bc840a03cab64d1cde9e93d68f46a425ec52c6"
+ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
+ "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
- "version": "==2.10.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.13.0"
},
"pylint": {
"hashes": [
- "sha256:9d945a73640e1fec07ee34b42f5669b770c759acd536ec7b16d7e4b87a9c9ff9",
- "sha256:daabda3f7ed9d1c60f52d563b1b854632fd90035bcf01443e234d3dc794e3b74"
+ "sha256:18783cca3cfee5b83c6c5d10b3cdb66c6594520ffae61890858fe8d932e1c6b4",
+ "sha256:349c8cd36aede4d50a0754a8c0218b43323d13d5d88f4b2952ddfe3e169681eb"
],
"index": "ia",
- "version": "==2.12.2"
- },
- "pyparsing": {
- "hashes": [
- "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4",
- "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"
- ],
- "version": "==3.0.6"
+ "version": "==2.15.9"
},
"pytest": {
"hashes": [
@@ -1713,212 +1833,167 @@
},
"pytest-cov": {
"hashes": [
- "sha256:578d5d15ac4a25e5f961c938b85a05b09fdaae9deef3bb6de9a6e766622ca7a6",
- "sha256:e7f0f5b1617d2210a2cabc266dfe2f4c75a8d32fb89eafb7ad9d06f6d076d470"
+ "sha256:2feb1b751d66a8bd934e5edfa2e961d11309dc37b73b0eabe73b5945fee20f6b",
+ "sha256:996b79efde6433cdbd0088872dbc5fb3ed7fe1578b68cdbba634f14bb8dd0470"
],
"index": "ia",
- "version": "==3.0.0"
+ "version": "==4.0.0"
},
"pytest-mock": {
"hashes": [
- "sha256:30c2f2cc9759e76eee674b81ea28c9f0b94f8f0445a1b87762cadf774f0df7e3",
- "sha256:40217a058c52a63f1042f0784f62009e976ba824c418cced42e88d5f40ab0e62"
+ "sha256:f4c973eeae0282963eb293eb173ce91b091a79c1334455acfac9ddee8a1c784b",
+ "sha256:fbbdb085ef7c252a326fd8cdcac0aa3b1333d8811f131bdcc701002e1be7ed4f"
],
"index": "ia",
- "version": "==3.6.1"
+ "version": "==3.10.0"
},
"pytest-pylint": {
"hashes": [
- "sha256:790c7a8019fab08e59bd3812db1657a01995a975af8b1c6ce95b9aa39d61da27",
- "sha256:b63aaf8b80ff33c8ceaa7f68323ed04102c7790093ccf6bdb261a4c2dc6fd564"
+ "sha256:b51d3f93bed9c192e2b046f16520981bee5abe7bd61b070306e7ee685219fdd3",
+ "sha256:d88e83c1023c641548a9ec3567707ceee7616632a986af133426d4a74d066932"
],
"index": "ia",
- "version": "==0.18.0"
+ "version": "==0.19.0"
},
"pytest-pythonpath": {
"hashes": [
- "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ "sha256:64e195b23a8f8c0c631fb16882d9ad6fa4137ed1f2961ddd15d52065cd435db6",
+ "sha256:e73e11dab2f0b83e73229e261242b251f0a369d7f527dbfec068822fd26a6ce5"
],
"index": "ia",
- "version": "==0.7.3"
- },
- "regex": {
- "hashes": [
- "sha256:0416f7399e918c4b0e074a0f66e5191077ee2ca32a0f99d4c187a62beb47aa05",
- "sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f",
- "sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc",
- "sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4",
- "sha256:0f594b96fe2e0821d026365f72ac7b4f0b487487fb3d4aaf10dd9d97d88a9737",
- "sha256:139a23d1f5d30db2cc6c7fd9c6d6497872a672db22c4ae1910be22d4f4b2068a",
- "sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4",
- "sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8",
- "sha256:2409b5c9cef7054dde93a9803156b411b677affc84fca69e908b1cb2c540025d",
- "sha256:2fee3ed82a011184807d2127f1733b4f6b2ff6ec7151d83ef3477f3b96a13d03",
- "sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f",
- "sha256:3598893bde43091ee5ca0a6ad20f08a0435e93a69255eeb5f81b85e81e329264",
- "sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a",
- "sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef",
- "sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f",
- "sha256:42b50fa6666b0d50c30a990527127334d6b96dd969011e843e726a64011485da",
- "sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc",
- "sha256:473e67837f786404570eae33c3b64a4b9635ae9f00145250851a1292f484c063",
- "sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50",
- "sha256:50a7ddf3d131dc5633dccdb51417e2d1910d25cbcf842115a3a5893509140a3a",
- "sha256:529801a0d58809b60b3531ee804d3e3be4b412c94b5d267daa3de7fadef00f49",
- "sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d",
- "sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d",
- "sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733",
- "sha256:563d5f9354e15e048465061509403f68424fef37d5add3064038c2511c8f5e00",
- "sha256:5d408a642a5484b9b4d11dea15a489ea0928c7e410c7525cd892f4d04f2f617b",
- "sha256:61600a7ca4bcf78a96a68a27c2ae9389763b5b94b63943d5158f2a377e09d29a",
- "sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36",
- "sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345",
- "sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0",
- "sha256:6e1d2cc79e8dae442b3fa4a26c5794428b98f81389af90623ffcc650ce9f6732",
- "sha256:74cbeac0451f27d4f50e6e8a8f3a52ca074b5e2da9f7b505c4201a57a8ed6286",
- "sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12",
- "sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646",
- "sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667",
- "sha256:7f301b11b9d214f83ddaf689181051e7f48905568b0c7017c04c06dfd065e244",
- "sha256:83ee89483672b11f8952b158640d0c0ff02dc43d9cb1b70c1564b49abe92ce29",
- "sha256:85bfa6a5413be0ee6c5c4a663668a2cad2cbecdee367630d097d7823041bdeec",
- "sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf",
- "sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4",
- "sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449",
- "sha256:96fc32c16ea6d60d3ca7f63397bff5c75c5a562f7db6dec7d412f7c4d2e78ec0",
- "sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a",
- "sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d",
- "sha256:a955b747d620a50408b7fdf948e04359d6e762ff8a85f5775d907ceced715129",
- "sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb",
- "sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e",
- "sha256:b9ed0b1e5e0759d6b7f8e2f143894b2a7f3edd313f38cf44e1e15d360e11749b",
- "sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83",
- "sha256:ca49e1ab99593438b204e00f3970e7a5f70d045267051dfa6b5f4304fcfa1dbf",
- "sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e",
- "sha256:cd410a1cbb2d297c67d8521759ab2ee3f1d66206d2e4328502a487589a2cb21b",
- "sha256:ce298e3d0c65bd03fa65ffcc6db0e2b578e8f626d468db64fdf8457731052942",
- "sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a",
- "sha256:d5fd67df77bab0d3f4ea1d7afca9ef15c2ee35dfb348c7b57ffb9782a6e4db6e",
- "sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94",
- "sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc",
- "sha256:dc07f021ee80510f3cd3af2cad5b6a3b3a10b057521d9e6aaeb621730d320c5a",
- "sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e",
- "sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965",
- "sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0",
- "sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36",
- "sha256:e6096b0688e6e14af6a1b10eaad86b4ff17935c49aa774eac7c95a57a4e8c296",
- "sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec",
- "sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23",
- "sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7",
- "sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe",
- "sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6",
- "sha256:f5be7805e53dafe94d295399cfbe5227f39995a997f4fd8539bf3cbdc8f47ca8",
- "sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b",
- "sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb",
- "sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b",
- "sha256:fbb9dc00e39f3e6c0ef48edee202f9520dafb233e8b51b06b8428cfcb92abd30",
- "sha256:fff55f3ce50a3ff63ec8e2a8d3dd924f1941b250b0aac3d3d42b687eeff07a8e"
- ],
- "version": "==2021.11.10"
+ "version": "==0.7.4"
},
"requests": {
"hashes": [
- "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
- "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.26.0"
+ "version": "==2.28.1"
},
"responses": {
"hashes": [
- "sha256:a2e3aca2a8277e61257cd3b1c154b1dd0d782b1ae3d38b7fa37cbe3feb531791",
- "sha256:f358ef75e8bf431b0aa203cc62625c3a1c80a600dbe9de91b944bf4e9c600b92"
+ "sha256:396acb2a13d25297789a5866b4881cf4e46ffd49cc26c43ab1117f40b973102e",
+ "sha256:dcf294d204d14c436fddcc74caefdbc5764795a40ff4e6a7740ed8ddbf3294be"
],
"index": "ia",
- "version": "==0.16.0"
+ "version": "==0.22.0"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==1.16.0"
},
+ "stack-data": {
+ "hashes": [
+ "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815",
+ "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8"
+ ],
+ "version": "==0.6.2"
+ },
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
"version": "==0.10.2"
},
"tomli": {
"hashes": [
- "sha256:c6ce0015eb38820eaf32b5db832dbc26deb3dd427bd5f6556cf0acac2c214fee",
- "sha256:f04066f68f5554911363063a30b108d2b5a5b1a010aa8b6132af78489fe3aade"
+ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
+ "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
+ ],
+ "version": "==2.0.1"
+ },
+ "tomlkit": {
+ "hashes": [
+ "sha256:07de26b0d8cfc18f871aec595fda24d95b08fef89d147caa861939f37230bf4b",
+ "sha256:71b952e5721688937fb02cf9d354dbcf0785066149d2855e44531ebdd2b65d73"
],
- "version": "==1.2.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.11.6"
},
"traitlets": {
"hashes": [
- "sha256:059f456c5a7c1c82b98c2e8c799f39c9b8128f6d0d46941ee118daace9eb70c7",
- "sha256:2d313cc50a42cd6c277e7d7dc8d4d7fedd06a2c215f78766ae7b1a66277e0033"
+ "sha256:6cc57d6dc28c85d5365961726ffd19b538739347749e13ebe34e03323a0e8f84",
+ "sha256:c864831efa0ba6576d09b44884b34e41defc18c0d7e720b4a2d6698c842cab3e"
],
- "version": "==5.1.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.8.0"
},
"types-beautifulsoup4": {
"hashes": [
- "sha256:7669f392a89b1276fcaa933d2f12e011a06b5b5c4bf88151dfcb215b89185749",
- "sha256:9e8c8d8c30e74bcfe65da58e797e7fb9411f8f90d98b6c6c6534f96250e4cd5f"
+ "sha256:c1f803367a2b07ad4fdac40ddbea557010dc4ddd1ee92d801f317eb02e2e3c72",
+ "sha256:d46be8f409ddccb6daaa9d118484185e70bcf552085c39c6d05b157cd1462e04"
],
"index": "ia",
- "version": "==4.10.5"
+ "version": "==4.11.6.1"
},
"types-dateparser": {
"hashes": [
- "sha256:93de659aca8381b33c2d8f94da67a7316d3cb91524c535b45ca23537afaebbf2",
- "sha256:a6552c9ce37fd119764349674c6526a0dc3ca5e582f86142f288bf55944a0b9f"
+ "sha256:5b0c8845167981f68f090894aa371bddbd0371341b90c3f868ac9524cd0a6b69",
+ "sha256:65232f1b3a952476fb98b31ae0a4019efd32635981040149b97b161d5ce2b4da"
],
"index": "ia",
- "version": "==1.0.8"
+ "version": "==1.1.4.4"
},
"types-pillow": {
"hashes": [
- "sha256:8dcda8883016678f6d37069e3a5500d506d7bec3acb0c5f4fedd5d9a8cb910b2",
- "sha256:ddca50f3d6e54f061b3a5d73e2123ad33b3c61ff20f0a203a82ce7c3eed824a5"
+ "sha256:98b8484ff343676f6f7051682a6cfd26896e993e86b3ce9badfa0ec8750f5405",
+ "sha256:c18d466dc18550d96b8b4a279ff94f0cbad696825b5ad55466604f1daf5709de"
],
"index": "ia",
- "version": "==8.3.8"
+ "version": "==9.3.0.4"
},
"types-psycopg2": {
"hashes": [
- "sha256:3b1230df902610bf16f9272e0654652fd7a17504f3131ed4f2a552a5643e762c",
- "sha256:8c25c1c2860d9a51bb165f7b953872d7842e1770e75483597ee919cd834bf456"
+ "sha256:084558d6bc4b2cfa249b06be0fdd9a14a69d307bae5bb5809a2f14cfbaa7a23f",
+ "sha256:bff045579642ce00b4a3c8f2e401b7f96dfaa34939f10be64b0dd3b53feca57d"
],
"index": "ia",
- "version": "==2.9.4"
+ "version": "==2.9.21.2"
},
"types-requests": {
"hashes": [
- "sha256:0893e112e1510bbb67f537941c92192de7472e51bf7f236e0e583866f0ed933e",
- "sha256:853571b3accc188976c0f4feffcaebf6cdfc170082b5e43f3358aa78de61f531"
+ "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3",
+ "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"
],
"index": "ia",
- "version": "==2.26.1"
+ "version": "==2.28.11.7"
+ },
+ "types-toml": {
+ "hashes": [
+ "sha256:171bdb3163d79a520560f24ba916a9fc9bff81659c5448a9fea89240923722be",
+ "sha256:b7b5c4977f96ab7b5ac06d8a6590d17c0bf252a96efc03b109c2711fb3e0eafd"
+ ],
+ "version": "==0.10.8.1"
+ },
+ "types-urllib3": {
+ "hashes": [
+ "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49",
+ "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"
+ ],
+ "version": "==1.26.25.4"
},
"typing-extensions": {
"hashes": [
- "sha256:4ca091dea149f945ec56afb48dae714f21e8692ef22a395223bcd328961b6a0e",
- "sha256:7f001e5ac290a0c0401508864c7ec868be4e701886d5b573a9528ed3973d9d3b"
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
],
- "version": "==4.0.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"urllib3": {
"hashes": [
- "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
- "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.26.7"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"wcwidth": {
"hashes": [
@@ -1929,59 +2004,73 @@
},
"wrapt": {
"hashes": [
- "sha256:086218a72ec7d986a3eddb7707c8c4526d677c7b35e355875a0fe2918b059179",
- "sha256:0877fe981fd76b183711d767500e6b3111378ed2043c145e21816ee589d91096",
- "sha256:0a017a667d1f7411816e4bf214646d0ad5b1da2c1ea13dec6c162736ff25a374",
- "sha256:0cb23d36ed03bf46b894cfec777eec754146d68429c30431c99ef28482b5c1df",
- "sha256:1fea9cd438686e6682271d36f3481a9f3636195578bab9ca3382e2f5f01fc185",
- "sha256:220a869982ea9023e163ba915077816ca439489de6d2c09089b219f4e11b6785",
- "sha256:25b1b1d5df495d82be1c9d2fad408f7ce5ca8a38085e2da41bb63c914baadff7",
- "sha256:2dded5496e8f1592ec27079b28b6ad2a1ef0b9296d270f77b8e4a3a796cf6909",
- "sha256:2ebdde19cd3c8cdf8df3fc165bc7827334bc4e353465048b36f7deeae8ee0918",
- "sha256:43e69ffe47e3609a6aec0fe723001c60c65305784d964f5007d5b4fb1bc6bf33",
- "sha256:46f7f3af321a573fc0c3586612db4decb7eb37172af1bc6173d81f5b66c2e068",
- "sha256:47f0a183743e7f71f29e4e21574ad3fa95676136f45b91afcf83f6a050914829",
- "sha256:498e6217523111d07cd67e87a791f5e9ee769f9241fcf8a379696e25806965af",
- "sha256:4b9c458732450ec42578b5642ac53e312092acf8c0bfce140ada5ca1ac556f79",
- "sha256:51799ca950cfee9396a87f4a1240622ac38973b6df5ef7a41e7f0b98797099ce",
- "sha256:5601f44a0f38fed36cc07db004f0eedeaadbdcec90e4e90509480e7e6060a5bc",
- "sha256:5f223101f21cfd41deec8ce3889dc59f88a59b409db028c469c9b20cfeefbe36",
- "sha256:610f5f83dd1e0ad40254c306f4764fcdc846641f120c3cf424ff57a19d5f7ade",
- "sha256:6a03d9917aee887690aa3f1747ce634e610f6db6f6b332b35c2dd89412912bca",
- "sha256:705e2af1f7be4707e49ced9153f8d72131090e52be9278b5dbb1498c749a1e32",
- "sha256:766b32c762e07e26f50d8a3468e3b4228b3736c805018e4b0ec8cc01ecd88125",
- "sha256:77416e6b17926d953b5c666a3cb718d5945df63ecf922af0ee576206d7033b5e",
- "sha256:778fd096ee96890c10ce96187c76b3e99b2da44e08c9e24d5652f356873f6709",
- "sha256:78dea98c81915bbf510eb6a3c9c24915e4660302937b9ae05a0947164248020f",
- "sha256:7dd215e4e8514004c8d810a73e342c536547038fb130205ec4bba9f5de35d45b",
- "sha256:7dde79d007cd6dfa65afe404766057c2409316135cb892be4b1c768e3f3a11cb",
- "sha256:81bd7c90d28a4b2e1df135bfbd7c23aee3050078ca6441bead44c42483f9ebfb",
- "sha256:85148f4225287b6a0665eef08a178c15097366d46b210574a658c1ff5b377489",
- "sha256:865c0b50003616f05858b22174c40ffc27a38e67359fa1495605f96125f76640",
- "sha256:87883690cae293541e08ba2da22cacaae0a092e0ed56bbba8d018cc486fbafbb",
- "sha256:8aab36778fa9bba1a8f06a4919556f9f8c7b33102bd71b3ab307bb3fecb21851",
- "sha256:8c73c1a2ec7c98d7eaded149f6d225a692caa1bd7b2401a14125446e9e90410d",
- "sha256:936503cb0a6ed28dbfa87e8fcd0a56458822144e9d11a49ccee6d9a8adb2ac44",
- "sha256:944b180f61f5e36c0634d3202ba8509b986b5fbaf57db3e94df11abee244ba13",
- "sha256:96b81ae75591a795d8c90edc0bfaab44d3d41ffc1aae4d994c5aa21d9b8e19a2",
- "sha256:981da26722bebb9247a0601e2922cedf8bb7a600e89c852d063313102de6f2cb",
- "sha256:ae9de71eb60940e58207f8e71fe113c639da42adb02fb2bcbcaccc1ccecd092b",
- "sha256:b73d4b78807bd299b38e4598b8e7bd34ed55d480160d2e7fdaabd9931afa65f9",
- "sha256:d4a5f6146cfa5c7ba0134249665acd322a70d1ea61732723c7d3e8cc0fa80755",
- "sha256:dd91006848eb55af2159375134d724032a2d1d13bcc6f81cd8d3ed9f2b8e846c",
- "sha256:e05e60ff3b2b0342153be4d1b597bbcfd8330890056b9619f4ad6b8d5c96a81a",
- "sha256:e6906d6f48437dfd80464f7d7af1740eadc572b9f7a4301e7dd3d65db285cacf",
- "sha256:e92d0d4fa68ea0c02d39f1e2f9cb5bc4b4a71e8c442207433d8db47ee79d7aa3",
- "sha256:e94b7d9deaa4cc7bac9198a58a7240aaf87fe56c6277ee25fa5b3aa1edebd229",
- "sha256:ea3e746e29d4000cd98d572f3ee2a6050a4f784bb536f4ac1f035987fc1ed83e",
- "sha256:ec7e20258ecc5174029a0f391e1b948bf2906cd64c198a9b8b281b811cbc04de",
- "sha256:ec9465dd69d5657b5d2fa6133b3e1e989ae27d29471a672416fd729b429eb554",
- "sha256:f122ccd12fdc69628786d0c947bdd9cb2733be8f800d88b5a37c57f1f1d73c10",
- "sha256:f99c0489258086308aad4ae57da9e8ecf9e1f3f30fa35d5e170b4d4896554d80",
- "sha256:f9c51d9af9abb899bd34ace878fbec8bf357b3194a10c4e8e0a25512826ef056",
- "sha256:fd76c47f20984b43d93de9a82011bb6e5f8325df6c9ed4d8310029a55fa361ea"
- ],
- "version": "==1.13.3"
+ "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3",
+ "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b",
+ "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4",
+ "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2",
+ "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656",
+ "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3",
+ "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff",
+ "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310",
+ "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a",
+ "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57",
+ "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069",
+ "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383",
+ "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe",
+ "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87",
+ "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d",
+ "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b",
+ "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907",
+ "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f",
+ "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0",
+ "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28",
+ "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1",
+ "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853",
+ "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc",
+ "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3",
+ "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3",
+ "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164",
+ "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1",
+ "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c",
+ "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1",
+ "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7",
+ "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1",
+ "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320",
+ "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed",
+ "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1",
+ "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248",
+ "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c",
+ "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456",
+ "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77",
+ "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef",
+ "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1",
+ "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7",
+ "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86",
+ "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4",
+ "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d",
+ "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d",
+ "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8",
+ "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5",
+ "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471",
+ "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00",
+ "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68",
+ "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3",
+ "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d",
+ "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735",
+ "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d",
+ "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569",
+ "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7",
+ "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59",
+ "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5",
+ "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb",
+ "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b",
+ "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f",
+ "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462",
+ "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015",
+ "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"
+ ],
+ "markers": "python_version < '3.11'",
+ "version": "==1.14.1"
}
}
}
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..4395f19
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,46 @@
+
+This directory contains `sandcrawler` python code for ingest pipelines, batch
+processing, PDF extraction, etc.
+
+
+## Development Quickstart
+
+As of December 2022, working with this code requires:
+
+- Python 3.8 (specifically, due to version specification in `pipenv`)
+- `pipenv` for python dependency management
+- generic and python-specific build tools (`pkg-config`, `python-dev`, etc)
+- poppler (PDF processing library)
+- libmagic
+- libsodium
+- access to IA internal packages (`devpi.us.archive.org`), specifically for
+ globalwayback and related packages
+
+In production and CI we use Ubuntu Focal (20.04). The CI script for this
+repository (`../.gitlab-ci.yml`) is the best place to look for a complete list
+of dependencies for both development and deployment. Note that our CI system
+runs from our cluster, which resolves the devpi access issue. For developer
+laptops, you may need `sshuttle` or something similar set up to do initial
+package pulls.
+
+It is recommended to set the env variable `PIPENV_VENV_IN_PROJECT=true` when
+working with pipenv. You can include this in a `.env` file.
+
+There is a Makefile which helps with the basics. Eg:
+
+ # install deps using pipenv
+ make deps
+
+ # run python tests
+ make test
+
+ # run code formatting and lint checks
+ make fmt lint
+
+Sometimes when developing it is helpful to enter a shell with pipenv, eg:
+
+ pipenv shell
+
+Often when developing it is helpful (or necessary) to set environment
+variables. `pipenv shell` will read from `.env`, so you can copy and edit
+`example.env`, and it will be used in tests, `pipenv shell`, etc.
diff --git a/python/TODO b/python/TODO
deleted file mode 100644
index 58a463f..0000000
--- a/python/TODO
+++ /dev/null
@@ -1,7 +0,0 @@
-
-ingest crawler:
-- SPNv2 only
- - remove most SPNv1/v2 path selection
-- landing page + fulltext hops only (short recursion depth)
-- use wayback client library instead of requests to fetch content
-- https://pypi.org/project/ratelimit/
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 029cbf1..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -5,7 +5,7 @@ might go to stdout, or might go to Kafka topic.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
import argparse
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index ce3a59c..0b74f9f 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -2,10 +2,11 @@
import argparse
import json
+import subprocess
import sys
from http.server import HTTPServer
-import raven
+import sentry_sdk
from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
@@ -43,12 +44,6 @@ def run_single_ingest(args):
def run_requests(args):
- if args.enable_sentry:
- try:
- git_sha = raven.fetch_git_sha("..")
- except Exception:
- git_sha = None
- sentry_client = raven.Client(release=git_sha) # noqa:
# TODO: switch to using JsonLinePusher
file_worker = IngestFileWorker(
try_spn2=not args.no_spn2,
@@ -75,11 +70,11 @@ def run_file_requests_backfill(args):
Can be used to batch re-process known files.
"""
- grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
- pdftext_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
- thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
- xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.kafka_env)
- htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.kafka_env)
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
grobid_sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
produce_topic=grobid_topic,
@@ -120,6 +115,20 @@ def run_file_requests_backfill(args):
pusher.run()
+def run_spn_status(args):
+ worker = IngestFileWorker(
+ sink=None,
+ try_spn2=False,
+ )
+
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/system")
+ resp.raise_for_status()
+ print(f"System status: {json.dumps(resp.json(), sort_keys=True)}")
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/user")
+ resp.raise_for_status()
+ print(f"User status: {json.dumps(resp.json(), sort_keys=True)}")
+
+
def run_api(args):
port = 8083
print("Listening on localhost:{}".format(port))
@@ -129,6 +138,12 @@ def run_api(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--enable-sentry",
+ action="store_true",
+ help="report exceptions to Sentry",
+ )
+ parser.add_argument("--env", default="dev", help="environment (eg, prod, qa, dev)")
subparsers = parser.add_subparsers()
sub_single = subparsers.add_parser("single", help="ingests a single base URL")
@@ -163,11 +178,6 @@ def main():
"--no-spn2", action="store_true", help="don't use live web (SPNv2)"
)
sub_requests.add_argument(
- "--enable-sentry",
- action="store_true",
- help="report exceptions to Sentry",
- )
- sub_requests.add_argument(
"--html-quick-mode",
action="store_true",
help="don't fetch individual sub-resources, just use CDX",
@@ -203,17 +213,30 @@ def main():
help="list of Kafka brokers (host/port) to use",
)
sub_file_requests_backfill.add_argument(
- "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
- )
- sub_file_requests_backfill.add_argument(
"--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
)
+ sub_spn_status = subparsers.add_parser(
+ "spn-status", help="checks save-page-now v2 API status for bot user"
+ )
+ sub_spn_status.set_defaults(func=run_spn_status)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ if args.enable_sentry:
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 9d3010e..24b749d 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -5,7 +5,7 @@ text extraction.
Example of large parallel run, locally:
-cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
import argparse
diff --git a/python/pytest.ini b/python/pytest.ini
index d916b98..18e8cf0 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -19,6 +19,8 @@ filterwarnings =
ignore::DeprecationWarning:.*justext
ignore::DeprecationWarning:.*internetarchive
ignore::DeprecationWarning:.*minio
+ ignore::DeprecationWarning:.*base_reporter
+ ignore::DeprecationWarning:.*loccache
ignore:.*pytz-deprecation-shim
log_level = INFO
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 6718c57..469c2a2 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -7,6 +7,7 @@ from .ia import (
CdxRow,
PetaboxError,
ResourceResult,
+ SavePageNowBackoffError,
SavePageNowClient,
SavePageNowError,
WarcResource,
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 07d9844..5c13318 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -43,7 +43,7 @@ class FilesetPlatformHelper:
def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
assert item.manifest
- total_size = sum([m.size for m in item.manifest]) or 0
+ total_size = sum([m.size or 0 for m in item.manifest]) or 0
largest_size = max([m.size or 0 for m in item.manifest]) or 0
if len(item.manifest) == 1:
if total_size < 64 * 1024 * 1024:
@@ -375,6 +375,11 @@ class FigshareHelper(FilesetPlatformHelper):
comp = comp[2:]
if comp[0] in [
"dataset",
+ # TODO: should the following be considered "out of scope"?
+ "journal_contribution",
+ "presentation",
+ "poster",
+ "thesis",
]:
comp = comp[1:]
@@ -472,7 +477,10 @@ class FigshareHelper(FilesetPlatformHelper):
# extra=dict(),
)
)
- assert not row.get("is_link_only")
+ if row.get("is_link_only"):
+ raise PlatformScopeError(
+ f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+ )
authors = []
for author in obj["authors"]:
@@ -521,6 +529,14 @@ def test_parse_figshare_url_path() -> None:
"12127176",
"4",
),
+ "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+ "1407386",
+ "3",
+ ),
+ "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+ "1094056",
+ "1",
+ ),
}
invalid = [
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 9696f3c..1d84ce5 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -158,22 +158,29 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
print(f" verifying {m.path}", file=sys.stderr)
file_meta = gen_file_metadata_path(local_path, allow_empty=True)
- assert (
- file_meta["size_bytes"] == m.size
- ), f"expected: {m.size} found: {file_meta['size_bytes']}"
+ if file_meta["size_bytes"] != m.size:
+ print(f" expected: {m.size} found: {file_meta['size_bytes']}", file=sys.stderr)
+ m.status = "mismatch-size"
+ continue
if m.sha1:
- assert file_meta["sha1hex"] == m.sha1
+ if file_meta["sha1hex"] != m.sha1:
+ m.status = "mismatch-sha1"
+ continue
else:
m.sha1 = file_meta["sha1hex"]
if m.sha256:
- assert file_meta["sha256hex"] == m.sha256
+ if file_meta["sha256hex"] != m.sha256:
+ m.status = "mismatch-sha256"
+ continue
else:
m.sha256 = file_meta["sha256hex"]
if m.md5:
- assert file_meta["md5hex"] == m.md5
+ if file_meta["md5hex"] != m.md5:
+ m.status = "mismatch-md5"
+ continue
else:
m.md5 = file_meta["md5hex"]
@@ -194,17 +201,27 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
m.mimetype = file_meta["mimetype"]
m.status = "verified-local"
+ # if verification failed for any individual files, bail out
+ for m in item.manifest:
+ if m.status != "verified-local":
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status=m.status,
+ )
+
# 2. upload all files, with metadata
assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
- item_files = []
+ item_files = {}
for m in item.manifest:
local_path = local_dir + "/" + m.path
- item_files.append(
- {
- "name": local_path,
- "remote_name": m.path,
- }
- )
+ if m.path == "name":
+ raise NotImplementedError(
+ "fileset file path is 'name', which is a reserved keyword"
+ )
+ item_files[m.path] = local_path
+ if len(item_files) != len(item.manifest):
+ raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
print(
f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
@@ -317,8 +334,16 @@ class WebFilesetStrategy(FilesetIngestStrategy):
else:
assert resource.terminal_status_code == 200
+ if not resource.body:
+ m.status = "empty-blob"
+ continue
+
file_meta = gen_file_metadata(resource.body)
- file_meta, html_resource = fix_transfer_encoding(file_meta, resource)
+ try:
+ file_meta, _html_resource = fix_transfer_encoding(file_meta, resource)
+ except Exception:
+ m.status = "transfer-encoding-error"
+ continue
if self.ingest_strategy == "web-file":
file_file_meta = file_meta
@@ -332,7 +357,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
continue
m.md5 = m.md5 or file_meta["md5hex"]
- m.sha1 = m.sha1 or file_meta["md5hex"]
+ m.sha1 = m.sha1 or file_meta["sha1hex"]
m.sha256 = m.sha256 or file_meta["sha256hex"]
m.mimetype = m.mimetype or file_meta["mimetype"]
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 1f957da..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -120,6 +120,15 @@ class GrobidClient(object):
"status_code": -4, # heritrix3 "HTTP timeout" code
"error_msg": "GROBID request (HTTP POST) timeout",
}
+ except requests.exceptions.ConnectionError as ce:
+ # intentionally raising this, so workers crash when GROBID
+ # unavailable. but do add a sleep to slow things down.
+ print(
+ "GROBID ConnectionError. sleeping as a slow-down before crashing",
+ file=sys.stderr,
+ )
+ time.sleep(5.0)
+ raise ce
info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200:
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 50183be..207f067 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -38,38 +38,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
redirect: Any
### General Tricks ###
-
- # highwire-style meta tag
- meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
- if not meta:
- meta = soup.find("meta", attrs={"name": "bepress_citation_pdf_url"})
- if not meta:
- meta = soup.find("meta", attrs={"name": "wkhealth_pdf_url"})
- if not meta:
- # researchgate does this; maybe others also?
- meta = soup.find("meta", attrs={"property": "citation_pdf_url"})
- if not meta:
- meta = soup.find("meta", attrs={"name": "eprints.document_url"})
- # if tag is only partially populated
- if meta and not meta.get("content"):
- meta = None
- # wiley has a weird almost-blank page we don't want to loop on
- if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
- url = meta["content"].strip()
- if "://doi.org/" in url:
- print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
- elif url.startswith("/"):
- if host_prefix + url == html_url:
- print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=host_prefix + url, technique="citation_pdf_url")
- elif url.startswith("http"):
- if url == html_url:
- print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=url, technique="citation_pdf_url")
- else:
- print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+ # note: most of these have migrated to the html_biblio code path
meta = soup.find("meta", attrs={"name": "generator"})
meta_generator = None
@@ -343,6 +312,24 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
url = html_url + "pdf"
return dict(pdf_url=url, technique="jmir-url")
+ # Google Drive
+ # this is assuming it is a PDF
+ if "drive.google.com/file/d/" in html_url and "/view" in html_url:
+ gdrive_id = html_url.split("/")[5]
+ if len(gdrive_id) > 10:
+ # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
+ return dict(
+ pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+ technique="google-drive",
+ )
+
+ # https://doi.org/10.24850/j-tyca-14-4-7
+ # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+ if "docs.google.com/viewer?url=" in html_url:
+ original_url = html_url.split("?url=")[1]
+ if original_url:
+ return dict(pdf_url=original_url, technique="docs.google.com viewer")
+
### below here we are doing guesses
# generic guess: try current URL plus .pdf, if it exists in the HTML body
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index c46788e..1e2d197 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -207,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"technique": "SciElo XML link",
},
{
- "in_doc_url": "/article/view/",
+ "in_doc_url": "/view/",
"in_fulltext_url": "viewXML",
"selector": "a[class='obj_galley_link']",
"attr": "href",
@@ -255,6 +255,12 @@ HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"attr": "href",
"technique": "dovepress fulltext link",
},
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
]
COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
@@ -325,10 +331,10 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
},
{
- "in_doc_url": "/article/view/",
+ "in_doc_url": "/view/",
"selector": "a#pdfDownloadLink",
"attr": "href",
- "technique": "pdfDownloadLink link",
+ "technique": "OJS pdfDownloadLink link",
"example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
},
{
@@ -597,13 +603,171 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"technique": "PDF Download link (journals.uchicago.edu)",
"example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
},
+ {
+ "in_doc_url": "integrityresjournals.org",
+ "in_fulltext_url": "/article-full-text-pdf/",
+ "selector": "a[target='_blank'].btn-danger",
+ "attr": "href",
+ "technique": "PDF Download link (integrityresjournals.org)",
+ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body.pkp_page_article a.download",
+ "attr": "href",
+ "technique": "OJS PDF Embed",
+ "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "/article/",
+ "selector": "a.pdf",
+ "attr": "href",
+ "technique": "OJS PDF link",
+ },
+ {
+ "in_doc_url": "scitemed.com/article/",
+ "in_fulltext_url": ".pdf",
+ "selector": "li.tab_pdf_btn a",
+ "attr": "href",
+ "technique": "PDF link (scitemed.com)",
+ },
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+ {
+ "in_doc_url": "/jvi.aspx",
+ "in_fulltext_url": "download_fulltext",
+ "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+ "attr": "href",
+ "technique": "erciyesmedj.com publication system PDF download link",
+ },
+ {
+ "selector": "body embed[alt='pdf']",
+ "attr": "src",
+ "technique": "embed PDF",
+ "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+ },
+ {
+ "in_fulltext_url": "viewPDFInterstitial",
+ "in_doc_url": "/view/",
+ "selector": "frameset frame",
+ "attr": "src",
+ "technique": "PDF iframe (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ # note this one has a special handler
+ "in_doc_url": "viewPDFInterstitial",
+ "in_fulltext_url": "://",
+ "selector": "head meta[http-equiv='refresh']",
+ "attr": "content",
+ "technique": "HTML meta refresh (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ "in_doc_url": "dlib.si/details/",
+ "in_fulltext_url": "PDF",
+ "selector": "body #FilesBox a",
+ "attr": "href",
+ "technique": "dlib.si download links",
+ "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+ },
+ {
+ "in_doc_url": "filclass.ru",
+ "in_fulltext_url": "pdf",
+ "selector": "main .pdf-article a.pdficon",
+ "attr": "href",
+ "technique": "filclass.ru PDF link",
+ "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+ },
+ {
+ "in_doc_url": "cdnsciencepub.com",
+ "in_fulltext_url": "pdf",
+ "selector": "article .info-panel a.btn--pdf",
+ "attr": "href",
+ "technique": "cdnsciencepub.com PDF link",
+ "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+ },
+ {
+ "in_doc_url": "grrjournal.com",
+ "in_fulltext_url": "pdf",
+ "selector": ".ereaders-main-section a[download]",
+ "attr": "href",
+ "technique": "grrjournal.com PDF link",
+ "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "pdf",
+ "selector": "#articleFullText a.remote_pdf",
+ "attr": "href",
+ "technique": "OJS remote_pdf link",
+ "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/abs/",
+ "in_fulltext_url": "/reader/",
+ "selector": "article.container .single__download a",
+ "attr": "href",
+ "technique": "worldscientific landing pages",
+ "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/",
+ "in_fulltext_url": "/pdf/",
+ "selector": "noscript a[target='_blank']",
+ "attr": "href",
+ "technique": "worldscientific reader",
+ "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": ".container .view-content .download-article a",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": "body a.download-pdf",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/view/",
+ "selector": "body .entry_details a.pdf",
+ "attr": "href",
+ "technique": "generic OJS/preprints",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body header a.download",
+ "attr": "href",
+ "technique": "generic OJS/preprints PDF Embed",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
# wiley has a weird almost-blank page we don't want to loop on
- "://onlinelibrary.wiley.com/doi/pdf/"
- "://doi.org/"
- "://dx.doi.org/"
+ "://onlinelibrary.wiley.com/doi/pdf/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "{'embed': '",
+]
+
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+ "javascript:",
+ "about:",
]
RELEASE_TYPE_MAP: Dict[str, str] = {
@@ -676,6 +840,9 @@ def html_extract_fulltext_url(
val = None
if "attr" in pattern:
val = elem.attrs.get(pattern["attr"])
+ # handle HTML redirect
+ if val and pattern["attr"] == "content" and "URL=" in val:
+ val = val.split("URL=")[1]
elif pattern.get("use_body"):
val = elem.text()
if "://" not in val:
@@ -687,13 +854,28 @@ def html_extract_fulltext_url(
if "in_fulltext_url" in pattern:
if pattern["in_fulltext_url"] not in val:
continue
+ skip_matched = False
for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
if skip_pattern in val.lower():
- continue
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+ if val.lower().startswith(skip_pattern):
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
if url_fuzzy_equal(doc_url, val):
# don't link to self, unless no other options
self_doc_url = (val, pattern.get("technique", "unknown"))
continue
+
+ # quirks modes / hacks
+ if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+ val = val[:-1]
+
return (val, pattern.get("technique", "unknown"))
if self_doc_url:
print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
@@ -795,6 +977,9 @@ def load_adblock_rules() -> braveblock.Adblocker:
"||pbs.twimg.com^",
"||badge.dimensions.ai^",
"||recaptcha.net^",
+ "||tag.imagino.com^",
+ "||consent.cookiebot.com^",
+ "||recaptcha.net^",
# not sure about these CC badges (usually via a redirect)
# "||licensebuttons.net^",
# "||i.creativecommons.org^",
@@ -808,6 +993,8 @@ def load_adblock_rules() -> braveblock.Adblocker:
"js/_getUACode.js"
# PLOS images
"/resource/img/icon.*.16.png^",
+ # CAIRN broken tracking tag
+ "cairn-int.info//about.php?cairn_guest=",
],
)
@@ -824,12 +1011,19 @@ def _extract_generic(
url = node.attrs.get(attr)
# special-case a couple meta URI prefixes which don't match with adblock rules
skip = False
- for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:"]:
+ for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]:
if url and url.startswith(prefix):
skip = True
break
+ if url and "/" not in url and "." not in url and " " in url:
+ # eg: "Ce fichier n'existe pas"
+ skip = True
if skip:
continue
+ if url and url.startswith("https://https://"):
+ url = url[8:]
+ elif url and url.startswith("http://http://"):
+ url = url[7:]
if url:
# print(url, file=sys.stderr)
resources.append(dict(url=url.strip(), type=type_name))
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index dc9aae5..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -136,6 +136,8 @@ def fuzzy_match_url(left: str, right: str) -> bool:
return True
if left == right + "/" or right == left + "/":
return True
+ if left.replace("//", "/") == right.replace("//", "/"):
+ return True
return False
@@ -147,6 +149,13 @@ def test_fuzzy_match_url() -> None:
assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+ assert (
+ fuzzy_match_url(
+ "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+ "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+ )
+ is True
+ )
# should probably handle these?
assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
@@ -202,10 +211,19 @@ class CdxApiClient:
else:
status_code = int(raw[4])
- # CDX rows with no WARC records?
+ # remove CDX rows with no WARC records (?)
if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
continue
+ # remove CDX rows with SHA256 (not SHA1) digests
+ if raw[5].startswith("sha-256"):
+ continue
+
+ # remove CDX rows with 'error' digests
+ # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+ if raw[5].lower() == "error":
+ continue
+
row = CdxRow(
surt=raw[0],
datetime=raw[1],
@@ -316,7 +334,7 @@ class CdxApiClient:
params: Dict[str, str] = {
"url": url,
"matchType": "exact",
- "limit": "-25",
+ "limit": "-40",
"output": "json",
# Collapsing seems efficient, but is complex; would need to include
# other filters and status code in filter
@@ -327,11 +345,14 @@ class CdxApiClient:
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+ closest_dt = "00000000"
if closest:
if isinstance(closest, datetime.datetime):
- params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ params["closest"] = closest_dt
else:
- params["closest"] = closest
+ closest_dt = closest
+ params["closest"] = closest_dt
params["sort"] = "closest"
# print(params, file=sys.stderr)
rows = self._query_api(params)
@@ -345,13 +366,15 @@ class CdxApiClient:
*reverse* order.
"""
return (
+ int(r.url == url),
int(r.status_code in (200, 226)),
int(0 - (r.status_code or 999)),
int(r.mimetype == best_mimetype),
int(r.mimetype != "warc/revisit"),
- int(r.datetime[:6]),
- int("/" in r.warc_path),
+ r.datetime[:4] == closest_dt[:4],
int(r.datetime),
+ # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+ int("/" in r.warc_path),
)
rows = sorted(rows, key=_cdx_sort_key)
@@ -396,6 +419,9 @@ class WaybackClient:
"User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
}
self.http_session = requests_retry_session()
+ self.record_http_session = requests_retry_session(
+ status_forcelist=[],
+ )
def fetch_petabox(
self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
@@ -604,13 +630,15 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = self.http_session.get(
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except requests.exceptions.ConnectionError:
+ raise WaybackContentError("ConnectionError (wayback replay fetch)")
except requests.exceptions.ChunkedEncodingError:
raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
except UnicodeDecodeError:
@@ -620,14 +648,14 @@ class WaybackClient:
)
)
- try:
- resp.raise_for_status()
- except Exception as e:
- raise WaybackError(str(e))
- # print(resp.url, file=sys.stderr)
-
# defensively check that this is actually correct replay based on headers
if "X-Archive-Src" not in resp.headers:
+ # check if this was an error first
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ raise WaybackError(str(e))
+ # otherwise, a weird case (200/redirect but no Src header
raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
if datetime not in resp.url:
raise WaybackError(
@@ -671,11 +699,18 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = self.http_session.get(
+ # when fetching via `id_`, it is possible to get a 5xx error which
+ # is either a wayback error, or an actual replay of an upstream 5xx
+ # error. the exception control flow here is tweaked, and a
+ # different HTTP session is used, to try and differentiate between
+ # the two cases
+ resp = None
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
+ resp.raise_for_status()
except requests.exceptions.TooManyRedirects:
raise WaybackContentError("redirect loop (wayback replay fetch)")
except UnicodeDecodeError:
@@ -684,15 +719,19 @@ class WaybackClient:
url
)
)
- try:
- resp.raise_for_status()
except Exception as e:
+ if resp is not None and "X-Archive-Src" in resp.headers:
+ raise WaybackContentError(
+ f"expected redirect record but got captured HTTP status: {resp.status_code}"
+ )
raise WaybackError(str(e))
- # print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
# previously check for "X-Archive-Redirect-Reason" here
- if "X-Archive-Src" not in resp.headers:
+ if (
+ "X-Archive-Src" not in resp.headers
+ and "X-Archive-Redirect-Reason" not in resp.headers
+ ):
raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
if datetime not in resp.url:
raise WaybackError(
@@ -931,7 +970,9 @@ class SavePageNowClient:
self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
self.v2endpoint = v2endpoint
- self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
+ self.v2_session = requests_retry_session(
+ retries=5, backoff_factor=3, status_forcelist=[502, 504]
+ )
self.v2_session.headers.update(
{
"User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
@@ -1010,20 +1051,46 @@ class SavePageNowClient:
if domain in request_url:
force_simple_get = 1
break
- resp = self.v2_session.post(
- self.v2endpoint,
- data={
- "url": request_url,
- "capture_all": 1,
- "capture_outlinks": capture_outlinks,
- "capture_screenshot": 0,
- "if_not_archived_within": "1d",
- "force_get": force_simple_get,
- "skip_first_archive": 1,
- "outlinks_availability": 0,
- "js_behavior_timeout": 0,
- },
- )
+
+ # check if SPNv2 user has capacity available
+ resp = self.v2_session.get(f"{self.v2endpoint}/status/user")
+ if resp.status_code == 429:
+ raise SavePageNowBackoffError(
+ f"SPNv2 availability API status_code: {resp.status_code}"
+ )
+ elif resp.status_code != 200:
+ raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+ resp.raise_for_status()
+ status_user = resp.json()
+ if status_user["available"] <= 1:
+ print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+ raise SavePageNowBackoffError(
+ "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+ )
+
+ req_data = {
+ "url": request_url,
+ "capture_all": 1,
+ "if_not_archived_within": "1d",
+ "skip_first_archive": 1,
+ "js_behavior_timeout": 0,
+ # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+ # implementation
+ # "capture_screenshot": 0,
+ # "outlinks_availability": 0,
+ }
+ if force_simple_get:
+ req_data["force_get"] = force_simple_get
+ if capture_outlinks:
+ req_data["capture_outlinks"] = capture_outlinks
+ try:
+ resp = self.v2_session.post(
+ self.v2endpoint,
+ data=req_data,
+ )
+ except requests.exceptions.ConnectionError:
+ raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
if resp.status_code == 429:
raise SavePageNowBackoffError(
"status_code: {}, url: {}".format(resp.status_code, request_url)
@@ -1032,6 +1099,7 @@ class SavePageNowClient:
raise SavePageNowError(
"SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
)
+ resp.raise_for_status()
resp_json = resp.json()
if (
@@ -1040,6 +1108,30 @@ class SavePageNowClient:
and "You have already reached the limit of active sessions" in resp_json["message"]
):
raise SavePageNowBackoffError(resp_json["message"])
+ elif (
+ resp_json
+ and "message" in resp_json
+ and "The same snapshot had been made" in resp_json["message"]
+ ):
+ return SavePageNowResult(
+ False,
+ "spn2-recent-capture",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif resp_json.get("status") == "error":
+ return SavePageNowResult(
+ False,
+ resp_json.get("status_ext") or resp_json["status"],
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
raise SavePageNowError(
"Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
@@ -1047,6 +1139,7 @@ class SavePageNowClient:
job_id = resp_json["job_id"]
print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+ time.sleep(0.1)
# poll until complete
final_json = None
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index d0c3e0e..03277f8 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -20,6 +20,7 @@ from sandcrawler.ia import (
NoCaptureError,
PetaboxError,
ResourceResult,
+ SavePageNowBackoffError,
SavePageNowClient,
SavePageNowError,
WaybackClient,
@@ -103,7 +104,7 @@ class IngestFileWorker(SandcrawlerWorker):
self.pdftext_sink = kwargs.get("pdftext_sink")
self.xmldoc_sink = kwargs.get("xmldoc_sink")
self.htmlteixml_sink = kwargs.get("htmlteixml_sink")
- self.max_hops = 6
+ self.max_hops = 8
self.try_existing_ingest = kwargs.get("try_existing_ingest", False)
self.try_existing_grobid = kwargs.get("try_existing_grobid", True)
@@ -115,8 +116,11 @@ class IngestFileWorker(SandcrawlerWorker):
self.max_html_resources = 200
self.base_url_blocklist = [
- # robot blocking
+ "://localhost/",
+ "://127.0.0.1/",
+ # robot blocking / rate-limited
"://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
# temporary, until we implement specific fetch and 'petabox' output
"://archive.org/",
"://www.archive.org/",
@@ -125,8 +129,8 @@ class IngestFileWorker(SandcrawlerWorker):
"://openlibrary.org/",
"://www.openlibrary.org/",
"://fatcat.wiki/",
+ "://scholar.archive.org/",
"://orcid.org/",
- "://doaj.org/",
# Domain squats
"://bartandjones.com",
"://ijretm.com",
@@ -150,8 +154,11 @@ class IngestFileWorker(SandcrawlerWorker):
"doi.org/10.2307/", # JSTOR; slow and many redirects
"doi.org/10.18730/", # fao.org: database entry
"doi.org/10.15468/", # gbif.org: database entry
+ "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
# deprecated domain (doesn't redirect correctly)
"://edoc.mpg.de/",
+ # bogus/spam PDFs
+ "://isiarticles.com/",
]
self.wall_blocklist = [
@@ -163,12 +170,18 @@ class IngestFileWorker(SandcrawlerWorker):
"/password-login",
"://gateway.isiknowledge.com/",
"/login?TARGET=",
+ "jstage.jst.go.jp/sblogin",
+ "://acw.elsevier.com/SSOCore",
+ "://acw.sciencedirect.com/SSOCore",
+ "/login?source=",
]
self.cookie_blocklist = [
"/cookieAbsent",
"cookieSet=1",
"error=cookies_not_supported",
+ # SPNv2 seems to work (not end up here), but heritrix fails
+ "://secure.jbs.elsevierhealth.com/",
]
self.src_valid_mimetypes = [
@@ -445,7 +458,10 @@ class IngestFileWorker(SandcrawlerWorker):
return dict(status="html-selectolax-error")
html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
assert html_biblio
- html_body = html_extract_body_teixml(resource.body)
+ try:
+ html_body = html_extract_body_teixml(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="html-teixml-error")
html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
html_scope = html_guess_scope(
resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")
@@ -610,7 +626,7 @@ class IngestFileWorker(SandcrawlerWorker):
result["status"] = "skip-url-blocklist"
return result
- # check against known loginwall URLs
+ # also check against known loginwall patterns
for block in self.wall_blocklist:
if block in next_url:
# TODO: blocked-wall instead of skip-wall
@@ -632,6 +648,12 @@ class IngestFileWorker(SandcrawlerWorker):
result["status"] = "spn2-error"
result["error_message"] = str(e)[:1600]
return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
except PetaboxError as e:
result["status"] = "petabox-error"
result["error_message"] = str(e)[:1600]
@@ -683,7 +705,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not resource.body:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -699,7 +721,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not resource.body or file_meta["size_bytes"] == 0:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
# here we split based on ingest type to try and extract a next hop
@@ -737,6 +759,12 @@ class IngestFileWorker(SandcrawlerWorker):
result["extract_next_hop"] = fulltext_url
if not fulltext_url:
+ # check if we hit a paywall/loginwall
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+ # else, just failed to find link
result["status"] = "no-pdf-link"
return result
next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
@@ -816,6 +844,12 @@ class IngestFileWorker(SandcrawlerWorker):
if resource.revisit_cdx:
result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+ # check if we hit a paywall/loginwall before trying mimetype
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+
if ingest_type == "pdf":
if file_meta["mimetype"] != "application/pdf":
result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 542dfbc..3acbece 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -146,11 +146,10 @@ class IngestFilesetWorker(IngestFileWorker):
result["status"] = "wayback-content-error"
result["error_message"] = str(e)[:1600]
return result
- except NotImplementedError:
- # result['status'] = 'not-implemented'
- # result['error_message'] = str(e)[:1600]
- # return result
- resource = None
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
html_biblio = None
if resource:
@@ -180,7 +179,7 @@ class IngestFilesetWorker(IngestFileWorker):
return result
if not resource.body:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -196,7 +195,7 @@ class IngestFilesetWorker(IngestFileWorker):
return result
if not resource.body or file_meta["size_bytes"] == 0:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
# here we split based on ingest type to try and extract a next hop
@@ -256,7 +255,7 @@ class IngestFilesetWorker(IngestFileWorker):
result["status"] = "wrong-mimetype"
return result
else:
- # raise NotImplementedError()
+ # eg, datasets, components, etc
pass
result["_html_biblio"] = html_biblio
@@ -378,7 +377,30 @@ class IngestFilesetWorker(IngestFileWorker):
return result
# 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
- archive_result = strategy_helper.process(dataset_meta)
+ try:
+ archive_result = strategy_helper.process(dataset_meta)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
# 4. Summarize status and return structured result metadata.
result["status"] = archive_result.status
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 0ff7fe0..fb42e71 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -196,10 +196,20 @@ def fetch_html_resources(
wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
if not wayback_resp or wayback_resp.status != "success":
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
- file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
- if file_meta["sha1hex"] != wayback_resp.cdx.sha1hex:
+ # for HTML sub-resources specifically, we allow the CDX SHA1 to match
+ # either the transfer-encoded or inner (un-encoded) payload body to
+ # match. This is because of an ambiguity in the WARC specification
+ outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
+ try:
+ file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ except Exception as e:
+ raise WaybackContentError(f"bad gzip encoding: {e}")
+ if (
+ file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ ):
raise WaybackContentError(
- f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}"
+ f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url} found:{file_meta['sha1hex']} expected:{wayback_resp.cdx.sha1hex}"
)
full.append(
WebResource(
@@ -250,6 +260,8 @@ def html_guess_platform(
in doc.html
):
return "ojs"
+ if '<a href="https://www.pubpub.org">Published with' in doc.html:
+ return "pubpub"
if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
return "arpha"
if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index d47ab89..8836515 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -99,7 +99,7 @@ class SandcrawlerMinioClient(object):
sha1hex: str,
extension: str = "",
prefix: str = "",
- bucket: str = None,
+ bucket: Optional[str] = None,
) -> bytes:
"""
sha1hex is sha1 of the blob itself
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index db001dd..4e37036 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -279,9 +279,9 @@ def test_datetime_to_cdx() -> None:
def requests_retry_session(
retries: int = 10,
- backoff_factor: int = 3,
+ backoff_factor: int = 1,
status_forcelist: List[int] = [500, 502, 504],
- session: requests.Session = None,
+ session: Optional[requests.Session] = None,
) -> requests.Session:
"""
From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 6c18395..97d338e 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -69,35 +69,45 @@ BAD_PDF_SHA1HEX: List[str] = [
"58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
"59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0",
"5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
+ "5c5b45c85eff07d4302844e00ec8baa57b988c60",
"5e04779cbbae5ce88bb786064f756885dd6895fe",
"5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "62247fe6b8d3ca50477cafddbe24bf63832d6674",
"623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
"646c4a654270606256397684204ff0f3d17be2e7",
"64d821d728f9a3dc944b4c03be00feea0b57e314",
+ "668b7d777203af4b261d21bf4669fc9b385062e1",
"689b5cb3ddef213d612363a903f10d0358ea64d2",
"6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
"74e617dc95555e8ca3aadd19d0c85b71cd77d1d9",
+ "7596438d77444a7c4228bb96fa4b394ba7d7e23b",
"75c2662a96ccc48891228df7c85eb7d4da9dd621",
"771f1ca0007a6fbed5b4a434c73f524f715d33c1",
"776859635e9dc01d97b0582f49c814ffbcb019fb",
"781dafda896a9f5c30f3d0a011f79a3b79b574c4",
"788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
"79d6cba3c6e577a0f3a3a9fe575680d38454938d",
+ "7b8b7e8e4b789579a7d2fda329db52528383a652",
+ "7c5c925cfb7c5a861b5c0a1d923308f9bedd335e",
"7cfc0739be9c49d94272110a0a748256bdde9be6",
"7daf61526ec825151f384cc1db510ca5237d5d80",
"7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
+ "800e47a7ed214f7acac85cc29aa7b0f9c0e218ae",
"8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67",
"859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad",
"88edcbab1cac2d70af5870422974afc253f4f0c6",
"89860fc475fcb2a2d86c4544df52ec8fd5e6533f",
"8dcaf4ef132900dd378f7be526c884b17452713b",
"8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
+ "8ec1a17ec19ae8ade95b9bdc837236981e83fffb",
"949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
"961ec451172f373f919c593737466300e42062cb",
"976989fa6e447578d9ce16ec5b526f0e09d6df50",
+ "977f23723027d7052df9b49eb467e6c0b9af93ff",
"98b02eb70066c182c705ef4d14d8b723ad7f1fab",
"993ca31f6974f8387bb18dd7d38987d290da8781",
"9dbd05af3442e6f42d67868054751b76973f4171",
+ "a1cc781c694a48e018f4de110b58f561aa212051",
"a2298c137b9c8c8975bad62eea9224edb95e6952",
"a2671738755ab8b24775e95375dc72f1ca4e5fd6",
"a26f299fb97c646effeebd4c5e2968786bd0f781",
@@ -106,6 +116,7 @@ BAD_PDF_SHA1HEX: List[str] = [
"a69665d0b5d3b95f54f68406eee3ed50c67efb45",
"a8357c31837404f9ebd798999d546c9398ab3648",
"a9162b9aef5e5da0897275fede1a6cff8cc93dfc",
+ "abc9d264df446707b40d7c9f79befd0f89291e59",
"ad038725bf6855a79f3c768ebe93c7103d14522f",
"aef581bf42e76e527f5aed3b8958fd4e7a24819f",
"b2b66b9c7f817a20144456f99c0be805602e8597",
@@ -116,9 +127,11 @@ BAD_PDF_SHA1HEX: List[str] = [
"b8b427e5b3d650ba9e03197f9c3917e25b878930",
"bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
"be0cda7642e9247b3ee41cd2017fa709aab4f344",
+ "beff1b0c24aa99989be73c66dfb1d1e7578e370b",
"c1b583fbd052572f08158d39ffe4d7510dadbebb",
"c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1",
"c4abbb284f4acaca9e8ceb88f842901984e84d33",
+ "c58e028269c8dfd3a442f6745c81b4c0e8610c43",
"c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
"c7687fa6f637c7d32a25be0e772867d87536d35c",
"c7d8b37ec99cf0d987e60667f05299f200e18a5d",
@@ -131,7 +144,9 @@ BAD_PDF_SHA1HEX: List[str] = [
"d188762a7e3ab5d4ee8a897204316513e4e636ec",
"d613b9e4442f5d5d19ea6814fa9729bff7da7c85",
"d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "d91d3830bf455e6dd782eee46218e35d29f07dfd",
"da2211ee2dbc6dda36571976d810e2366a3d2504",
+ "dbb3093a797e0ae83d39eb7b235ff85a17fd965c",
"e01bb7256d77aea258313bb410dfcfc10512f420",
"e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
"e2c3b8a2cf33d5e8972bc9ddb78373766a75e412",
@@ -142,6 +157,7 @@ BAD_PDF_SHA1HEX: List[str] = [
"eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36",
"eb1b39fd7a874896688855a22efddef10272427c",
"eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "ecc4b927c5e84e145c610876931bc261ae13769b",
"edf8dcc8736f06afbaca0e01d60bd2c475403a3d",
"ee2ee6ae2cf05128810d0d95bbe69bd263e140de",
"ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
@@ -150,6 +166,7 @@ BAD_PDF_SHA1HEX: List[str] = [
"f0ea221d8587cede25592266486e119d277f7096",
"f68f9a9202a75d2aee35252e104d796f9515001e",
"f9314d3bf2eac78a7d78d18adcccdb35542054ef",
+ "f932ef936021a3b00842b481478c40868b9a007c",
"fd9bd560662e070b222d63052830837829c490f0",
]
@@ -324,7 +341,8 @@ def process_pdf(
)
# this call sometimes fails an returns an AttributeError
page0rect = page0.page_rect()
- except (AttributeError, poppler.document.LockedDocumentError) as e:
+ # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch
+ except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e:
# may need to expand the set of exceptions caught here over time, but
# starting with a narrow set
return PdfExtractResult(
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 1119211..112df6a 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -82,7 +82,7 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
self.pdftrio_client = pdftrio_client
self.sink = sink
- def process(self, record: Any, key: str = None) -> Any:
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
fetch_sec = None
@@ -126,7 +126,7 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
self.sink = sink
self.mode = mode
- def process(self, blob: Any, key: str = None) -> Any:
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
if not blob:
return None
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d753380..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -101,6 +101,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if raw["ingest_type"] not in ("pdf", "xml", "html"):
self.counts["skip-ingest-type"] += 1
return None
+ # limit on base_url length
+ if len(raw["base_url"]) > 1500:
+ self.counts["skip-url-too-long"] += 1
+ return None
request = {
"ingest_type": raw["ingest_type"],
"base_url": raw["base_url"],
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 597a0ac..356f050 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -108,7 +108,7 @@ class SandcrawlerWorker(object):
"""
return True
- def process(self, task: Any, key: str = None) -> Any:
+ def process(self, task: Any, key: Optional[str] = None) -> Any:
"""
Derived workers need to implement business logic here.
@@ -477,7 +477,7 @@ class ZipfilePusher(RecordPusher):
self.counts["total"] += 1
# NB doesn't really extract the file, just gives you a stream (file-like-object) for reading it
flo = archive.open(zipinfo, "r")
- data = flo.read(2 ** 32)
+ data = flo.read(2**32)
flo.close()
if self.batch_size:
batch.append(data)
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 482dc33..aebcbe1 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -7,9 +7,10 @@ or S3 (SeaweedFS).
import argparse
import os
+import subprocess
import sys
-import raven
+import sentry_sdk
from sandcrawler import *
from sandcrawler.persist import (
@@ -18,13 +19,6 @@ from sandcrawler.persist import (
PersistXmlDocWorker,
)
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-try:
- git_sha = raven.fetch_git_sha("..")
-except Exception:
- git_sha = None
-sentry_client = raven.Client(release=git_sha)
-
def run_grobid_extract(args):
consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env)
@@ -278,8 +272,8 @@ def run_ingest_file(args):
pdftext_sink=pdftext_sink,
xmldoc_sink=xmldoc_sink,
htmlteixml_sink=htmlteixml_sink,
- # don't SPNv2 for --bulk backfill
- try_spn2=not args.bulk,
+ # don't SPNv2 for --bulk or --skip-spn
+ try_spn2=not (args.bulk or args.skip_spn),
spn_cdx_retry_sec=spn_cdx_retry_sec,
)
pusher = KafkaJsonPusher(
@@ -448,6 +442,11 @@ def main():
help="consume from bulk kafka topic (eg, for ingest backfill)",
)
sub_ingest_file.add_argument(
+ "--skip-spn",
+ action="store_true",
+ help="don't do SPN lookups",
+ )
+ sub_ingest_file.add_argument(
"--priority",
action="store_true",
help="consume from priority kafka topic (eg, for SPN requests)",
@@ -479,6 +478,16 @@ def main():
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 3c769cf..27ccf21 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -19,7 +19,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
"""
@@ -32,10 +32,7 @@ import sys
from collections import Counter
import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+import sentry_sdk
def b32_hex(s):
@@ -88,7 +85,6 @@ class DeliverDumpGrobidS3:
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
def main():
parser = argparse.ArgumentParser()
@@ -115,6 +111,8 @@ def main():
)
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index fcaf51f..093f32a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -16,14 +16,11 @@ import sys
from collections import Counter
from http.client import IncompleteRead
-import raven
+import sentry_sdk
import wayback.exception
from gwb.loader import CDXLoaderFactory
from wayback.resourcestore import ResourceStore
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
class DeliverGwbDisk:
def __init__(self, disk_dir, **kwargs):
@@ -161,7 +158,6 @@ class DeliverGwbDisk:
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
def main():
parser = argparse.ArgumentParser()
@@ -191,6 +187,8 @@ def main():
)
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 1f08c4f..6f37ede 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -24,7 +24,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
- wayback/GWB libraries
"""
@@ -43,14 +43,11 @@ from collections import Counter
from http.client import IncompleteRead
import boto3
-import raven
+import sentry_sdk
import wayback.exception
from gwb.loader import CDXLoaderFactory
from wayback.resourcestore import ResourceStore
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
class DeliverGwbS3:
def __init__(self, s3_bucket, **kwargs):
@@ -179,7 +176,6 @@ class DeliverGwbS3:
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
def main():
parser = argparse.ArgumentParser()
@@ -206,6 +202,8 @@ def main():
)
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index 67286b9..aef5c12 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -23,6 +23,7 @@ DOMAIN_BLOCKLIST = [
"ncbi.nlm.nih.gov/",
# "semanticscholar.org/",
"://doi.org/",
+ "://dx.doi.org/",
"zenodo.org/",
"figshare.com/",
"://archive.org/",
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index d52e793..8a353ca 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -33,13 +33,20 @@ def run(args):
req = transform(json.loads(l))
except:
print(l, file=sys.stderr)
+ if args.force_recrawl:
+ req["force_recrawl"] = True
print(json.dumps(req, sort_keys=True))
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
- "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
+ )
+ parser.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="whether to add recrawl (SPNv2) flag to request",
)
subparsers = parser.add_subparsers()
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 9607b85..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -25,10 +25,40 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
# OAI specific additions
"://hdl.handle.net/",
]
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+ "oai:hypotheses.org:%",
+]
+
RELEASE_STAGE_MAP = {
"info:eu-repo/semantics/draftVersion": "draft",
"info:eu-repo/semantics/submittedVersion": "submitted",
@@ -55,6 +85,11 @@ def transform(obj):
if not obj.get("urls"):
return []
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
# look in obj['formats'] for PDF?
if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
@@ -97,16 +132,17 @@ def transform(obj):
"base_url": base_url,
"ingest_type": "pdf",
"link_source": "oai",
- "link_source_id": obj["oai"].lower(),
+ "link_source_id": oai_id,
"ingest_request_source": "metha-bulk",
"release_stage": release_stage,
"rel": rel,
"ext_ids": {
- "doi": doi,
"oai": obj["oai"].lower(),
},
"edit_extra": {},
}
+ if doi:
+ request["ext_ids"]["doi"] = doi
requests.append(request)
return requests
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index ad5353b..cb64a1a 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -15,12 +15,9 @@ DOMAIN_BLOCKLIST = [
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- "semanticscholar.org/",
"://doi.org/",
"zenodo.org/",
"figshare.com/",
- "://archive.org/",
- ".archive.org/",
]
RELEASE_STAGE_MAP = {
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 614b802..043c63d 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -5,28 +5,3 @@ def test_extract_fulltext_url():
resp = extract_fulltext_url("asdf", b"asdf")
assert resp == {}
-
- resp = extract_fulltext_url(
- "http://dummy-site/",
- b"""<html>
- <head>
- <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
- </head>
- <body>
- <h1>my big article here</h1>
- blah
- </body>
- </html>""",
- )
- assert resp["pdf_url"] == "http://www.example.com/content/271/20/11761.full.pdf"
- assert resp["technique"] == "citation_pdf_url"
-
- with open("tests/files/plos_one_article.html", "rb") as f:
- resp = extract_fulltext_url(
- "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
- f.read(),
- )
- assert (
- resp["pdf_url"]
- == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
- )
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index ad8c22e..e14a452 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -50,6 +50,19 @@ def test_ingest_success(ingest_worker_pdf):
"base_url": "http://dummy-host/",
}
responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
responses.POST,
"http://dummy-spnv2/save",
status=200,
@@ -136,6 +149,19 @@ def test_ingest_landing(ingest_worker):
"base_url": "http://dummy-host/",
}
responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
responses.POST,
"http://dummy-spnv2/save",
status=200,
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index f3a5e46..add2c60 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -4,7 +4,7 @@ import pytest
import responses
from test_wayback import *
-from sandcrawler import CdxPartial, SavePageNowClient, SavePageNowError
+from sandcrawler import CdxPartial, SavePageNowBackoffError, SavePageNowClient, SavePageNowError
TARGET = "http://dummy-target.dummy"
JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
@@ -117,6 +117,19 @@ def spn_client():
def test_savepagenow_success(spn_client):
responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
responses.POST,
"http://dummy-spnv2/save",
status=200,
@@ -143,7 +156,7 @@ def test_savepagenow_success(spn_client):
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 4
+ assert len(responses.calls) == 5
assert resp.success is True
assert resp.status == "success"
@@ -157,6 +170,19 @@ def test_savepagenow_success(spn_client):
def test_savepagenow_remote_error(spn_client):
responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
responses.POST,
"http://dummy-spnv2/save",
status=200,
@@ -177,7 +203,7 @@ def test_savepagenow_remote_error(spn_client):
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 3
+ assert len(responses.calls) == 4
assert resp.success is False
assert resp.status == ERROR_BODY["status_ext"]
@@ -191,6 +217,19 @@ def test_savepagenow_remote_error(spn_client):
def test_savepagenow_500(spn_client):
responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
responses.POST,
"http://dummy-spnv2/save",
status=200,
@@ -206,13 +245,49 @@ def test_savepagenow_500(spn_client):
with pytest.raises(SavePageNowError):
spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 2
+ assert len(responses.calls) == 3
+
+
+@responses.activate
+def test_savepagenow_no_slots(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 0,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+
+ with pytest.raises(SavePageNowBackoffError):
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 1
@responses.activate
def test_crawl_resource(spn_client, wayback_client):
responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
responses.POST,
"http://dummy-spnv2/save",
status=200,
@@ -244,7 +319,7 @@ def test_crawl_resource(spn_client, wayback_client):
print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"))
resp = spn_client.crawl_resource(TARGET, wayback_client)
- assert len(responses.calls) == 5
+ assert len(responses.calls) == 6
assert resp.hit is True
assert resp.status == "success"
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
index 198c949..7866480 100644
--- a/python_hadoop/README.md
+++ b/python_hadoop/README.md
@@ -68,7 +68,7 @@ running on a devbox and GROBID running on a dedicated machine:
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
tests/files/example.cdx
@@ -76,7 +76,7 @@ Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
-r hadoop \
-c mrjob.conf \
@@ -90,13 +90,13 @@ running on a devbox:
./backfill_hbase_from_cdx.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
tests/files/example.cdx
Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./backfill_hbase_from_cdx.py \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--hbase-table wbgrp-journal-extract-0-qa \
-r hadoop \
-c mrjob.conf \
diff --git a/sql/Makefile b/sql/Makefile
new file mode 100644
index 0000000..860addb
--- /dev/null
+++ b/sql/Makefile
@@ -0,0 +1,35 @@
+
+SHELL=/bin/bash -euo pipefail
+TODAY ?= $(shell date --iso --utc)
+DATADIR ?= /srv/sandcrawler/tasks/$(TODAY)
+DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S)
+DATABASE_URL ?= sandcrawler
+
+.PHONY: help
+help: ## Print info about all commands
+ @echo "Commands:"
+ @echo
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: create_datadir
+create_datadir:
+ mkdir -p $(DATADIR)/
+ sudo chmod a+rw $(DATADIR)/
+
+$(DATADIR)/.DB_DUMP:
+ sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip
+ mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump
+ touch $@
+
+.PHONY: database-snapshot
+database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot
+ @echo
+
+$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))"
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump
+ touch $@
+
+.PHONY: upload-database-snapshot
+upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org
+ @echo
diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql
new file mode 100644
index 0000000..698db7a
--- /dev/null
+++ b/sql/dump_reingest_bulk.sql
@@ -0,0 +1,31 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '24 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status like 'cdx-error'
+ OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql
new file mode 100644
index 0000000..7473420
--- /dev/null
+++ b/sql/dump_reingest_old.sql
@@ -0,0 +1,36 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '6 day'::INTERVAL
+ -- AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status like 'no-capture'
+ -- OR ingest_file_result.status like 'cdx-error'
+ -- OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql
index 725a404..dbeb199 100644
--- a/sql/dump_reingest_quarterly.sql
+++ b/sql/dump_reingest_quarterly.sql
@@ -1,20 +1,35 @@
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '91 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
AND (
ingest_file_result.status like 'spn2-%'
- OR ingest_file_result.status like 'cdx-error'
- OR ingest_file_result.status like 'wayback-error'
- OR ingest_file_result.status like 'wayback-content-error'
- OR ingest_file_result.status like 'petabox-error'
- OR ingest_file_result.status like 'gateway-timeout'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
)
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:filesize-limit'
@@ -29,3 +44,4 @@ COPY (
-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ROLLBACK;
diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql
index 6ef08c1..a83125c 100644
--- a/sql/dump_reingest_spn.sql
+++ b/sql/dump_reingest_spn.sql
@@ -1,19 +1,28 @@
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
AND ingest_request.created < NOW() - '6 hour'::INTERVAL
AND ingest_request.created > NOW() - '180 day'::INTERVAL
AND ingest_request.ingest_request_source = 'savepapernow-web'
AND (
ingest_file_result.status like 'spn2-%'
- -- OR ingest_file_result.status like 'cdx-error'
- -- OR ingest_file_result.status like 'wayback-error'
- -- OR ingest_file_result.status like 'wayback-content-error'
- OR ingest_file_result.status like 'petabox-error'
- -- OR ingest_file_result.status like 'gateway-timeout'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
)
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:filesize-limit'
@@ -23,3 +32,5 @@ COPY (
AND ingest_file_result.status != 'spn2-error:network-authentication-required'
AND ingest_file_result.status != 'spn2-error:unknown'
) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql
new file mode 100644
index 0000000..b72a096
--- /dev/null
+++ b/sql/dump_reingest_terminalstatus.sql
@@ -0,0 +1,34 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '72 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '10 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 404
+ )
+ AND (
+ ingest_request.base_url LIKE 'https://doi.org/10.3390/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%'
+ )
+) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
+ROLLBACK;
diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql
index 65800eb..a019938 100644
--- a/sql/dump_reingest_weekly.sql
+++ b/sql/dump_reingest_weekly.sql
@@ -1,20 +1,30 @@
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '8 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container')
AND (
ingest_file_result.status like 'spn2-%'
- -- OR ingest_file_result.status like 'cdx-error'
- -- OR ingest_file_result.status like 'wayback-error'
- -- OR ingest_file_result.status like 'wayback-content-error'
- OR ingest_file_result.status like 'petabox-error'
- -- OR ingest_file_result.status like 'gateway-timeout'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
)
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:filesize-limit'
@@ -29,3 +39,4 @@ COPY (
-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ROLLBACK;
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 23a935e..33dba66 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -149,6 +149,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);
CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created);
CREATE TABLE IF NOT EXISTS ingest_file_result (
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh
new file mode 100755
index 0000000..d39a171
--- /dev/null
+++ b/sql/reingest_bulk.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_bulk_current.json
+
+cat /srv/sandcrawler/tasks/reingest_bulk_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh
new file mode 100755
index 0000000..96e5416
--- /dev/null
+++ b/sql/reingest_old.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_old.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_old_current.json
+
+cat /srv/sandcrawler/tasks/reingest_old_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 20fd82b..8a2996c 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \
| shuf \
| head -n120000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
index 6fb1e4b..c693a64 100755
--- a/sql/reingest_spn.sh
+++ b/sql/reingest_spn.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_spn.json \
| shuf \
| head -n60000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh
new file mode 100755
index 0000000..5cb6d51
--- /dev/null
+++ b/sql/reingest_terminalstatus_forcerecrawl.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json
+
+cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \
+ | shuf \
+ | head -n100000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index 04ce39d..d2e2444 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -13,7 +13,7 @@ sudo -u sandcrawler pipenv run \
cat /srv/sandcrawler/tasks/reingest_weekly_current.json \
| shuf \
- | head -n60000 \
+ | head -n80000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt
new file mode 100644
index 0000000..bd20c5c
--- /dev/null
+++ b/sql/stats/2022-04-26_stats.txt
@@ -0,0 +1,432 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 416 GB | 10 GB | 426 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 58 GB | 41 GB | 99 GB
+ "public"."ingest_request" | 50 GB | 48 GB | 98 GB
+ "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 37 GB | 34 GB | 71 GB
+ "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB
+ "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 3015 MB | 31 MB | 3046 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 192402128 | 271919997557597
+ (1 row)
+
+ # 271,919,997,557,597 -> ~272 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 191760695
+ text/html | 330351
+ application/octet-stream | 186696
+ application/xml | 42170
+ application/xhtml+xml | 31470
+ text/plain | 16449
+ application/jats+xml | 6902
+ application/gzip | 6681
+ | 6033
+ application/postscript | 4916
+ image/jpeg | 2901
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 934
+ application/x-bzip2 | 891
+ image/png | 476
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294
+ application/x-compress | 274
+ video/mp4 | 150
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12831
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 130732381 | 162760251
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 149749828
+ warc/revisit | 10437210
+ application/octet-stream | 733161
+ text/html | 642992
+ text/xml | 525483
+ unk | 217642
+ application/postscript | 81127
+ application/save | 81023
+ binary/octet-stream | 67938
+ application/x-download | 41137
+ image/pdf | 39712
+ application/download | 37153
+ text/plain | 36342
+ application/force-download | 21496
+ multipart/form-data | 9792
+ application | 5366
+ application/x-octetstream | 5166
+ application/x-msdownload | 3851
+ .pdf | 3445
+ application/x-pdf | 3018
+ pdf | 1618
+ file | 1370
+ application/binary | 1354
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1047
+ application/unknown | 1001
+ 0 | 773
+ text/pdf | 729
+ application/blob | 673
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 123669603
+ (1 row)
+
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 115668412
+ 500 | 7995428
+ -4 | 5745
+ 503 | 18
+ (4 rows)
+
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 54780825
+ 0.5.5-fatcat | 48003940
+ | 12694404
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | mag | 43701948
+ pdf | doi | 40044585
+ pdf | doaj | 6016771
+ html | doaj | 3648181
+ pdf | arxiv | 2676200
+ pdf | pmc | 2402453
+ html | doi | 41492
+ xml | doaj | 20638
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 829
+ html | spn | 52
+ xml | doi | 1
+ xml | spn | 1
+ (16 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 20936949
+ pdf | doi | fatcat-ingest | 15590201
+ pdf | doaj | doaj | 6016771
+ html | doaj | doaj | 3648181
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 691405
+ pdf | pmc | fatcat-ingest | 297646
+ pdf | pmc | fatcat-changelog | 75982
+ html | doi | fatcat-ingest | 37904
+ xml | doaj | doaj | 20638
+ html | doi | fatcat-changelog | 3534
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1562
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 829
+ html | doi | savepapernow-web | 54
+ html | spn | savepapernow-web | 52
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 3
+ xml | doi | savepapernow-web | 1
+ xml | spn | savepapernow-web | 1
+ (26 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doaj | 1619621
+ html | doaj | 1208412
+ pdf | mag | 167653
+ pdf | oai | 15282
+ xml | doaj | 11196
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (7 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 15968290 | 0.312
+ pdf | unpaywall | 43932525 | 32618045 | 0.742
+ pdf | mag | 43701948 | 32662926 | 0.747
+ pdf | doi | 40044738 | 10925369 | 0.273
+ pdf | doaj | 6016771 | 3042569 | 0.506
+ html | doaj | 3648181 | 344208 | 0.094
+ pdf | arxiv | 2676206 | 2269708 | 0.848
+ pdf | pmc | 2402453 | 1855679 | 0.772
+ html | doi | 41492 | 1739 | 0.042
+ xml | doaj | 20638 | 6899 | 0.334
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 829 | 616 | 0.743
+ html | spn | 52 | 7 | 0.135
+ xml | doi | 1 | 0 | 0.000
+ xml | spn | 1 | 0 | 0.000
+ (16 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+---------------------------------+----------
+ pdf | success | 85709322
+ pdf | no-pdf-link | 29713304
+ pdf | no-capture | 26632191
+ pdf | redirect-loop | 10979145
+ pdf | terminal-bad-status | 4977000
+ pdf | link-loop | 3434877
+ pdf | skip-url-blocklist | 3114258
+ pdf | blocked-cookie | 2156835
+ html | wrong-scope | 1126911
+ pdf | wrong-mimetype | 980546
+ pdf | gateway-timeout | 651562
+ pdf | spn2-cdx-lookup-failure | 484016
+ pdf | spn2-backoff | 399382
+ pdf | cdx-error | 373964
+ pdf | wayback-content-error | 354370
+ html | success | 345860
+ pdf | null-body | 336182
+ pdf | spn2-error:500 | 309755
+ pdf | forbidden | 291175
+ pdf | not-found | 275560
+ pdf | too-many-redirects | 262312
+ html | unknown-scope | 230352
+ html | redirect-loop | 226596
+ html | html-resource-no-capture | 205646
+ html | no-capture | 164014
+ component | spn2-cdx-lookup-failure | 148825
+ component | wrong-mimetype | 130344
+ html | null-body | 100296
+ pdf | wayback-error | 94286
+ pdf | spn2-wayback-error | 81365
+ component | no-capture | 75278
+ pdf | spn2-error | 69830
+ pdf | skip-wall | 57744
+ pdf | spn2-error:too-many-redirects | 53808
+ pdf | remote-server-error | 41286
+ pdf | petabox-error | 38800
+ pdf | invalid-host-resolution | 37337
+ pdf | read-timeout | 36872
+ component | spn2-backoff | 33217
+ pdf | empty-blob | 27946
+ component | spn2-error | 24078
+ pdf | spn2-error:unknown | 23697
+ component | gateway-timeout | 23139
+ html | wrong-mimetype | 22731
+ html | wayback-content-error | 20507
+ pdf | spn2-error:host-crawling-paused | 19900
+ pdf | bad-redirect | 19183
+ html | terminal-bad-status | 13354
+ component | blocked-cookie | 12287
+ component | spn2-error:500 | 11271
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 38144779
+ pdf | | 32762240
+ pdf | 301 | 9433087
+ html | 200 | 1716127
+ pdf | 403 | 1416632
+ pdf | 302 | 1134668
+ pdf | 404 | 888853
+ pdf | 401 | 746311
+ pdf | 503 | 655894
+ pdf | 400 | 531479
+ component | | 337603
+ pdf | 500 | 247944
+ html | 301 | 224237
+ html | | 167194
+ pdf | 303 | 135048
+ component | 200 | 130663
+ pdf | 429 | 93489
+ pdf | 410 | 67392
+ pdf | 420 | 26722
+ pdf | 502 | 18770
+ pdf | 409 | 15152
+ pdf | 509 | 15113
+ pdf | 999 | 11747
+ html | 404 | 9879
+ pdf | 307 | 8895
+ pdf | 412 | 7053
+ pdf | 308 | 6627
+ pdf | 202 | 5289
+ xml | 200 | 2540
+ html | 500 | 2480
+ pdf | 520 | 2220
+ pdf | 521 | 1844
+ pdf | 206 | 1739
+ html | 302 | 1407
+ pdf | 504 | 1146
+ html | 303 | 1123
+ pdf | 421 | 986
+ pdf | 406 | 938
+ pdf | 204 | 498
+ pdf | 505 | 468
+ pdf | 300 | 436
+ pdf | 508 | 422
+ pdf | 426 | 405
+ html | 429 | 402
+ html | 403 | 398
+ pdf | 432 | 366
+ component | 301 | 294
+ pdf | 405 | 210
+ pdf | 226 | 166
+ component | 302 | 128
+ (50 rows)
+
diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt
new file mode 100644
index 0000000..864abd4
--- /dev/null
+++ b/sql/stats/2022-04-27_crawl_changelog.txt
@@ -0,0 +1,191 @@
+ domain | status | count
+--------------------------------------+-------------------------+--------
+ academic.oup.com | | 1243
+ academic.oup.com | spn2-cdx-lookup-failure | 990
+ aip.scitation.org | | 313
+ aip.scitation.org | spn2-cdx-lookup-failure | 224
+ ajps.uomustansiriyah.edu.iq | | 235
+ apps.crossref.org | | 1329
+ apps.crossref.org | spn2-cdx-lookup-failure | 942
+ apps.crossref.org | no-pdf-link | 387
+ archaeologydataservice.ac.uk | | 422
+ archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289
+ arxiv.org | | 3512
+ arxiv.org | spn2-cdx-lookup-failure | 2319
+ arxiv.org | success | 1177
+ assets.researchsquare.com | | 571
+ assets.researchsquare.com | spn2-cdx-lookup-failure | 322
+ assets.researchsquare.com | success | 249
+ brill.com | | 397
+ brill.com | spn2-cdx-lookup-failure | 265
+ cla.berkeley.edu | | 239
+ classiques-garnier.com | | 249
+ cyberleninka.ru | | 340
+ cyberleninka.ru | spn2-cdx-lookup-failure | 244
+ dergipark.org.tr | | 468
+ dergipark.org.tr | spn2-cdx-lookup-failure | 333
+ dl.acm.org | | 592
+ dl.acm.org | spn2-cdx-lookup-failure | 470
+ doi.ala.org.au | | 288
+ doi.ala.org.au | spn2-cdx-lookup-failure | 220
+ doi.org | | 1107
+ doi.org | terminal-bad-status | 679
+ doi.org | spn2-cdx-lookup-failure | 415
+ downloads.hindawi.com | | 279
+ downloads.hindawi.com | success | 267
+ edbs.uomustansiriyah.edu.iq | | 294
+ edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209
+ elibrary.kdpu.edu.ua | | 320
+ elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233
+ elibrary.ru | | 722
+ elibrary.ru | spn2-cdx-lookup-failure | 505
+ europepmc.org | | 986
+ europepmc.org | spn2-cdx-lookup-failure | 681
+ europepmc.org | success | 291
+ figshare.com | | 377
+ figshare.com | spn2-cdx-lookup-failure | 328
+ fjfsdata01prod.blob.core.windows.net | | 255
+ fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216
+ hammer.purdue.edu | | 224
+ ieeexplore.ieee.org | | 3904
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654
+ ieeexplore.ieee.org | gateway-timeout | 792
+ ieeexplore.ieee.org | spn2-backoff | 419
+ journals.eco-vector.com | | 428
+ journals.eco-vector.com | spn2-cdx-lookup-failure | 306
+ journals.lww.com | | 727
+ journals.lww.com | spn2-cdx-lookup-failure | 622
+ journals.openedition.org | | 806
+ journals.openedition.org | spn2-cdx-lookup-failure | 554
+ journals.plos.org | | 348
+ journals.plos.org | spn2-cdx-lookup-failure | 244
+ kiss.kstudy.com | | 226
+ kluwerlawonline.com | | 723
+ kluwerlawonline.com | spn2-cdx-lookup-failure | 489
+ kluwerlawonline.com | link-loop | 203
+ linkinghub.elsevier.com | | 401
+ linkinghub.elsevier.com | spn2-backoff | 342
+ mdpi-res.com | | 1463
+ mdpi-res.com | success | 1337
+ muse.jhu.edu | | 346
+ muse.jhu.edu | spn2-cdx-lookup-failure | 253
+ onepetro.org | | 363
+ onepetro.org | spn2-cdx-lookup-failure | 284
+ online.ucpress.edu | | 1620
+ online.ucpress.edu | spn2-cdx-lookup-failure | 1511
+ onlinelibrary.wiley.com | | 2913
+ onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109
+ onlinelibrary.wiley.com | terminal-bad-status | 787
+ opendata.uni-halle.de | | 519
+ opendata.uni-halle.de | spn2-cdx-lookup-failure | 343
+ osf.io | | 1554
+ osf.io | spn2-cdx-lookup-failure | 1350
+ papers.ssrn.com | | 2207
+ papers.ssrn.com | spn2-cdx-lookup-failure | 1727
+ papers.ssrn.com | link-loop | 457
+ psycharchives.org | | 384
+ psycharchives.org | spn2-cdx-lookup-failure | 283
+ publons.com | | 493
+ publons.com | spn2-cdx-lookup-failure | 348
+ pubs.acs.org | | 1240
+ pubs.acs.org | spn2-cdx-lookup-failure | 881
+ pubs.acs.org | terminal-bad-status | 298
+ pubs.rsc.org | | 603
+ pubs.rsc.org | spn2-cdx-lookup-failure | 460
+ repositories.lib.utexas.edu | | 1861
+ repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288
+ repositories.lib.utexas.edu | terminal-bad-status | 523
+ s3-eu-west-1.amazonaws.com | | 216
+ sage.figshare.com | | 374
+ sage.figshare.com | spn2-cdx-lookup-failure | 309
+ scholar.dkyobobook.co.kr | | 220
+ scholarworks.gsu.edu | | 749
+ scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577
+ tandf.figshare.com | | 214
+ www.atlantis-press.com | | 338
+ www.atlantis-press.com | spn2-cdx-lookup-failure | 214
+ www.cairn.info | | 782
+ www.cairn.info | spn2-cdx-lookup-failure | 541
+ www.cambridge.org | | 2325
+ www.cambridge.org | spn2-cdx-lookup-failure | 1787
+ www.cambridge.org | no-pdf-link | 300
+ www.cell.com | | 213
+ www.concrete.org | | 476
+ www.concrete.org | spn2-cdx-lookup-failure | 340
+ www.dbpia.co.kr | | 375
+ www.dbpia.co.kr | spn2-cdx-lookup-failure | 275
+ www.degruyter.com | | 3849
+ www.degruyter.com | spn2-cdx-lookup-failure | 2969
+ www.degruyter.com | no-pdf-link | 712
+ www.dib.ie | | 1100
+ www.dib.ie | spn2-cdx-lookup-failure | 1038
+ www.e-periodica.ch | | 821
+ www.e-periodica.ch | spn2-cdx-lookup-failure | 620
+ www.e-periodica.ch | no-pdf-link | 201
+ www.elibrary.ru | | 401
+ www.elibrary.ru | spn2-cdx-lookup-failure | 281
+ www.emerald.com | | 390
+ www.emerald.com | spn2-cdx-lookup-failure | 275
+ www.eurekaselect.com | | 275
+ www.frontiersin.org | | 1266
+ www.frontiersin.org | spn2-cdx-lookup-failure | 1025
+ www.hanspub.org | | 229
+ www.hindawi.com | | 604
+ www.hindawi.com | spn2-cdx-lookup-failure | 594
+ www.inderscience.com | | 201
+ www.jstage.jst.go.jp | | 1094
+ www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807
+ www.jstage.jst.go.jp | success | 206
+ www.mdpi.com | | 4340
+ www.mdpi.com | spn2-cdx-lookup-failure | 4258
+ www.nomos-elibrary.de | | 2749
+ www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909
+ www.nomos-elibrary.de | redirect-loop | 819
+ www.osti.gov | | 275
+ www.oxfordhandbooks.com | | 248
+ www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224
+ www.pdcnet.org | | 217
+ www.researchsquare.com | | 483
+ www.researchsquare.com | spn2-cdx-lookup-failure | 317
+ www.scielo.br | | 319
+ www.scielo.br | spn2-cdx-lookup-failure | 222
+ www.sciencedirect.com | | 3384
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 3267
+ www.spiedigitallibrary.org | | 441
+ www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327
+ www.tandfonline.com | | 2401
+ www.tandfonline.com | spn2-cdx-lookup-failure | 1552
+ www.tandfonline.com | no-pdf-link | 303
+ www.tandfonline.com | blocked-cookie | 250
+ www.taylorfrancis.com | | 1232
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 908
+ www.thieme-connect.de | | 520
+ www.thieme-connect.de | spn2-cdx-lookup-failure | 366
+ www.worldscientific.com | | 383
+ www.worldscientific.com | spn2-cdx-lookup-failure | 276
+ zenodo.org | | 10625
+ zenodo.org | spn2-cdx-lookup-failure | 7777
+ zenodo.org | success | 1574
+ zenodo.org | no-pdf-link | 1160
+ zivahub.uct.ac.za | | 3428
+ zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845
+ zivahub.uct.ac.za | no-pdf-link | 583
+ | | 130491
+ | spn2-cdx-lookup-failure | 95169
+ | success | 13354
+ | no-pdf-link | 9621
+ | terminal-bad-status | 3385
+ | spn2-backoff | 2396
+ | redirect-loop | 2216
+ | link-loop | 1850
+ | gateway-timeout | 1061
+ | spn2-error:blocked-url | 428
+ | blocked-cookie | 415
+ | spn2-error | 246
+(182 rows)
+
+----
+
+The overwhelming thing is `spn2-cdx-lookup-failure`. Should check in after a
+week or two, when crawling and retries are running smoothly, and see what
+things look like then.
diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt
new file mode 100644
index 0000000..8d98217
--- /dev/null
+++ b/sql/stats/2022-05-11_crawl_changelog.txt
@@ -0,0 +1,410 @@
+ domain | status | count
+-----------------------------------------------------------------+-------------------------+--------
+ academic.oup.com | | 2210
+ academic.oup.com | no-pdf-link | 1350
+ academic.oup.com | bad-redirect | 510
+ academiccommons.columbia.edu | | 379
+ academiccommons.columbia.edu | success | 339
+ aip.scitation.org | | 762
+ aip.scitation.org | terminal-bad-status | 430
+ apps.crossref.org | | 9894
+ apps.crossref.org | no-pdf-link | 9886
+ apps.euskadi.eus | | 242
+ apps.euskadi.eus | no-pdf-link | 240
+ arxiv.org | | 44889
+ arxiv.org | success | 28781
+ arxiv.org | spn2-backoff | 7975
+ arxiv.org | terminal-bad-status | 4508
+ arxiv.org | spn2-cdx-lookup-failure | 2010
+ arxiv.org | redirect-loop | 619
+ arxiv.org | no-pdf-link | 242
+ arxiv.org | spn2-error | 236
+ asa.scitation.org | | 356
+ asa.scitation.org | terminal-bad-status | 299
+ asmedigitalcollection.asme.org | | 240
+ assets.cureus.com | | 336
+ assets.cureus.com | success | 335
+ assets.researchsquare.com | | 1042
+ assets.researchsquare.com | success | 993
+ av.tib.eu | | 205
+ av.tib.eu | no-pdf-link | 203
+ bibliographie.uni-tuebingen.de | | 213
+ bibliographie.uni-tuebingen.de | no-pdf-link | 211
+ biorxiv.org | redirect-loop | 217
+ biorxiv.org | | 217
+ books.openedition.org | | 691
+ books.openedition.org | no-pdf-link | 687
+ boris.unibe.ch | | 525
+ boris.unibe.ch | success | 466
+ bridges.monash.edu | | 663
+ bridges.monash.edu | no-pdf-link | 647
+ brill.com | | 860
+ brill.com | success | 434
+ chemrxiv.org | | 201
+ classiques-garnier.com | | 242
+ content.iospress.com | | 325
+ content.iospress.com | link-loop | 247
+ core.tdar.org | | 216
+ core.tdar.org | no-pdf-link | 211
+ cyberleninka.ru | | 646
+ cyberleninka.ru | success | 620
+ d197for5662m48.cloudfront.net | | 263
+ d197for5662m48.cloudfront.net | success | 262
+ dergipark.org.tr | | 891
+ dergipark.org.tr | success | 526
+ dergipark.org.tr | no-pdf-link | 261
+ digi.ub.uni-heidelberg.de | | 427
+ digi.ub.uni-heidelberg.de | no-pdf-link | 427
+ direct.mit.edu | | 268
+ direct.mit.edu | no-pdf-link | 208
+ dl.acm.org | | 1719
+ dl.acm.org | success | 829
+ dl.acm.org | no-pdf-link | 546
+ dl.acm.org | terminal-bad-status | 205
+ dlc.library.columbia.edu | | 385
+ dlc.library.columbia.edu | terminal-bad-status | 319
+ doi.ala.org.au | | 724
+ doi.ala.org.au | no-pdf-link | 721
+ doi.apa.org | | 214
+ doi.org | | 3390
+ doi.org | terminal-bad-status | 2938
+ doi.org | redirect-loop | 233
+ doi.org | spn2-wayback-error | 208
+ doi.usp.org | | 325
+ doi.usp.org | no-pdf-link | 324
+ downloads.hindawi.com | | 1439
+ downloads.hindawi.com | success | 1436
+ du.diva-portal.org | | 589
+ du.diva-portal.org | success | 586
+ econtents.bc.unicamp.br | | 310
+ econtents.bc.unicamp.br | success | 310
+ ediss.uni-goettingen.de | | 728
+ ediss.uni-goettingen.de | success | 425
+ elibrary.kdpu.edu.ua | | 907
+ elibrary.kdpu.edu.ua | bad-redirect | 712
+ elibrary.ru | | 925
+ elibrary.ru | terminal-bad-status | 492
+ elibrary.ru | bad-redirect | 230
+ elibrary.vdi-verlag.de | | 393
+ elifesciences.org | | 296
+ elifesciences.org | success | 276
+ europepmc.org | | 3024
+ europepmc.org | success | 2541
+ europepmc.org | terminal-bad-status | 463
+ figshare.com | | 493
+ figshare.com | no-pdf-link | 440
+ files.osf.io | | 883
+ files.osf.io | success | 686
+ fjfsdata01prod.blob.core.windows.net | | 3869
+ fjfsdata01prod.blob.core.windows.net | success | 3818
+ ieeexplore.ieee.org | | 10854
+ ieeexplore.ieee.org | gateway-timeout | 5495
+ ieeexplore.ieee.org | spn2-backoff | 1662
+ ieeexplore.ieee.org | no-pdf-link | 1417
+ ieeexplore.ieee.org | success | 1410
+ ieeexplore.ieee.org | redirect-loop | 768
+ iiif.crossasia.org | | 7608
+ iiif.crossasia.org | no-pdf-link | 7568
+ ikee.lib.auth.gr | | 450
+ ikee.lib.auth.gr | success | 332
+ ins.journals.ekb.eg | | 212
+ iopscience.iop.org | | 268
+ jamanetwork.com | | 333
+ journals.aps.org | | 414
+ journals.asm.org | | 242
+ journals.flvc.org | | 245
+ journals.flvc.org | success | 242
+ journals.healio.com | | 755
+ journals.healio.com | terminal-bad-status | 668
+ journals.lincoln.ac.nz | | 244
+ journals.lincoln.ac.nz | success | 239
+ journals.lww.com | | 1772
+ journals.lww.com | link-loop | 1425
+ journals.lww.com | spn2-backoff | 209
+ journals.openedition.org | | 1192
+ journals.openedition.org | redirect-loop | 467
+ journals.openedition.org | success | 451
+ journals.plos.org | | 771
+ journals.plos.org | success | 750
+ journals.ub.uni-heidelberg.de | | 787
+ journals.ub.uni-heidelberg.de | success | 741
+ kazanmedjournal.ru | | 240
+ kazanmedjournal.ru | success | 231
+ kiss.kstudy.com | | 219
+ kiss.kstudy.com | no-pdf-link | 218
+ kluwerlawonline.com | | 444
+ kluwerlawonline.com | link-loop | 402
+ libraetd.lib.virginia.edu | | 362
+ libraetd.lib.virginia.edu | no-pdf-link | 361
+ link.springer.com | | 305
+ linkinghub.elsevier.com | | 568
+ linkinghub.elsevier.com | spn2-backoff | 545
+ ltu-figshare-repo.s3.aarnet.edu.au | | 269
+ ltu-figshare-repo.s3.aarnet.edu.au | success | 268
+ mausamjournal.imd.gov.in | | 202
+ mdpi-res.com | | 8892
+ mdpi-res.com | success | 8863
+ mededpublish.org | | 1900
+ mededpublish.org | no-pdf-link | 1900
+ meetingorganizer.copernicus.org | | 276
+ meetingorganizer.copernicus.org | no-pdf-link | 271
+ muse.jhu.edu | | 1047
+ muse.jhu.edu | terminal-bad-status | 755
+ muse.jhu.edu | link-loop | 203
+ online.ucpress.edu | | 358
+ online.ucpress.edu | link-loop | 212
+ onlinelibrary.wiley.com | | 5813
+ onlinelibrary.wiley.com | terminal-bad-status | 4587
+ onlinelibrary.wiley.com | spn2-wayback-error | 614
+ onlinelibrary.wiley.com | blocked-cookie | 381
+ open.library.ubc.ca | | 206
+ opendata.uni-halle.de | | 1768
+ opendata.uni-halle.de | success | 1215
+ opendata.uni-halle.de | wrong-mimetype | 260
+ opendata2.uni-halle.de | | 206
+ opg.optica.org | | 205
+ osf.io | | 2949
+ osf.io | no-pdf-link | 2404
+ osf.io | spn2-backoff | 299
+ papers.ssrn.com | | 3962
+ papers.ssrn.com | link-loop | 3800
+ peerj.com | | 273
+ preprints.jmir.org | | 275
+ preprints.jmir.org | cdx-error | 255
+ publikationen.bibliothek.kit.edu | | 213
+ publons.com | | 593
+ publons.com | no-pdf-link | 590
+ pubs.acs.org | | 2288
+ pubs.acs.org | terminal-bad-status | 1841
+ pubs.acs.org | spn2-wayback-error | 210
+ pubs.rsc.org | | 1698
+ pubs.rsc.org | bad-redirect | 811
+ pubs.rsc.org | link-loop | 352
+ pubs.rsc.org | success | 307
+ radiopaedia.org | | 220
+ read.dukeupress.edu | | 303
+ repositories.lib.utexas.edu | | 1570
+ repositories.lib.utexas.edu | bad-redirect | 513
+ repositories.lib.utexas.edu | spn2-backoff | 383
+ repositories.lib.utexas.edu | gateway-timeout | 379
+ repositories.lib.utexas.edu | terminal-bad-status | 282
+ repository.uj.ac.za | | 489
+ repository.uj.ac.za | no-pdf-link | 365
+ repository.unsworks.unsw.edu.au | | 397
+ repository.urosario.edu.co | | 2429
+ repository.urosario.edu.co | success | 1648
+ repository.urosario.edu.co | bad-redirect | 613
+ rex.libraries.wsu.edu | no-pdf-link | 241
+ rex.libraries.wsu.edu | | 241
+ rsdjournal.org | | 208
+ rsdjournal.org | success | 208
+ s3-ap-southeast-2.amazonaws.com | | 282
+ s3-ap-southeast-2.amazonaws.com | success | 277
+ s3-eu-west-1.amazonaws.com | | 4615
+ s3-eu-west-1.amazonaws.com | success | 4593
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237
+ sage.figshare.com | | 415
+ sage.figshare.com | no-pdf-link | 385
+ scholar.dkyobobook.co.kr | | 512
+ scholar.dkyobobook.co.kr | no-pdf-link | 509
+ scholarlypublishingcollective.org | | 287
+ scholarworks.gsu.edu | | 1132
+ scholarworks.gsu.edu | success | 1000
+ scholarworks.iupui.edu | | 205
+ scholarworks.umass.edu | | 417
+ scholarworks.umass.edu | success | 400
+ sciencescholar.us | | 404
+ secure.jbs.elsevierhealth.com | | 727
+ secure.jbs.elsevierhealth.com | terminal-bad-status | 722
+ tandf.figshare.com | | 354
+ tandf.figshare.com | no-pdf-link | 342
+ unsworks.unsw.edu.au | | 408
+ unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342
+ valep.vc.univie.ac.at | no-pdf-link | 737
+ valep.vc.univie.ac.at | | 737
+ watermark.silverchair.com | | 1604
+ watermark.silverchair.com | success | 1598
+ wayf.switch.ch | | 215
+ wayf.switch.ch | no-pdf-link | 213
+ www.ahajournals.org | | 438
+ www.ahajournals.org | no-pdf-link | 306
+ www.ahbps.org | | 316
+ www.ahbps.org | success | 312
+ www.atenaeditora.com.br | | 390
+ www.atenaeditora.com.br | terminal-bad-status | 333
+ www.atlantis-press.com | | 914
+ www.atlantis-press.com | success | 901
+ www.atsjournals.org | | 1245
+ www.atsjournals.org | success | 1189
+ www.biorxiv.org | | 712
+ www.biorxiv.org | success | 670
+ www.bloomsburycollections.com | | 982
+ www.bloomsburycollections.com | terminal-bad-status | 566
+ www.cahiers-clsl.ch | | 305
+ www.cahiers-clsl.ch | success | 298
+ www.cairn.info | | 1799
+ www.cairn.info | no-pdf-link | 662
+ www.cairn.info | link-loop | 487
+ www.cairn.info | success | 355
+ www.cairn.info | terminal-bad-status | 267
+ www.cambridge.org | | 3258
+ www.cambridge.org | no-pdf-link | 1682
+ www.cambridge.org | success | 682
+ www.cambridge.org | bad-redirect | 404
+ www.cambridge.org | link-loop | 302
+ www.dbpia.co.kr | | 763
+ www.dbpia.co.kr | no-pdf-link | 443
+ www.dbpia.co.kr | redirect-loop | 287
+ www.degruyter.com | | 12655
+ www.degruyter.com | no-pdf-link | 9112
+ www.degruyter.com | success | 2898
+ www.degruyter.com | spn2-backoff | 507
+ www.dib.ie | | 1381
+ www.dib.ie | no-pdf-link | 1378
+ www.dovepress.com | | 231
+ www.dovepress.com | success | 216
+ www.e-manuscripta.ch | | 767
+ www.e-manuscripta.ch | success | 399
+ www.e-periodica.ch | | 1406
+ www.e-periodica.ch | no-pdf-link | 1402
+ www.e-rara.ch | no-pdf-link | 251
+ www.e-rara.ch | | 251
+ www.editoracientifica.org | no-pdf-link | 205
+ www.editoracientifica.org | | 205
+ www.elgaronline.com | | 427
+ www.elibrary.ru | | 616
+ www.elibrary.ru | terminal-bad-status | 364
+ www.elibrary.ru | no-pdf-link | 216
+ www.emerald.com | | 862
+ www.emerald.com | no-pdf-link | 724
+ www.endocrine-abstracts.org | | 1907
+ www.endocrine-abstracts.org | no-pdf-link | 1905
+ www.eurekaselect.com | | 285
+ www.eurekaselect.com | link-loop | 246
+ www.even3.com.br | | 233
+ www.frontiersin.org | | 585
+ www.frontiersin.org | spn2-backoff | 436
+ www.humankineticslibrary.com | no-pdf-link | 207
+ www.humankineticslibrary.com | | 207
+ www.igi-global.com | | 1600
+ www.igi-global.com | no-pdf-link | 1199
+ www.igi-global.com | bad-redirect | 258
+ www.inderscience.com | | 385
+ www.inderscience.com | no-pdf-link | 365
+ www.inderscienceonline.com | | 202
+ www.ingentaconnect.com | | 450
+ www.ingentaconnect.com | no-pdf-link | 260
+ www.jstage.jst.go.jp | | 1248
+ www.jstage.jst.go.jp | success | 870
+ www.karger.com | | 313
+ www.liebertpub.com | | 271
+ www.liebertpub.com | no-pdf-link | 241
+ www.nicecjournal.co.uk | | 274
+ www.nicecjournal.co.uk | success | 274
+ www.nomos-elibrary.de | | 1771
+ www.nomos-elibrary.de | no-pdf-link | 788
+ www.nomos-elibrary.de | redirect-loop | 506
+ www.nomos-elibrary.de | bad-redirect | 207
+ www.osti.gov | | 381
+ www.osti.gov | link-loop | 326
+ www.persee.fr | | 277
+ www.preprints.org | | 225
+ www.preprints.org | success | 225
+ www.protocols.io | | 770
+ www.protocols.io | success | 485
+ www.repository.cam.ac.uk | | 510
+ www.repository.cam.ac.uk | bad-redirect | 213
+ www.research-collection.ethz.ch | | 416
+ www.research-collection.ethz.ch | bad-redirect | 249
+ www.researchsquare.com | | 1121
+ www.researchsquare.com | bad-redirect | 985
+ www.scielo.br | | 828
+ www.scielo.br | success | 641
+ www.sciencedirect.com | | 8567
+ www.sciencedirect.com | terminal-bad-status | 5773
+ www.sciencedirect.com | spn2-wayback-error | 1590
+ www.sciencedirect.com | no-pdf-link | 576
+ www.sciencedirect.com | spn2-backoff | 479
+ www.sciendo.com | | 257
+ www.sciendo.com | success | 222
+ www.scitepress.org | | 381
+ www.scitepress.org | no-pdf-link | 377
+ www.spiedigitallibrary.org | | 1061
+ www.spiedigitallibrary.org | bad-redirect | 571
+ www.spiedigitallibrary.org | gateway-timeout | 233
+ www.tandfonline.com | | 4934
+ www.tandfonline.com | no-pdf-link | 2088
+ www.tandfonline.com | terminal-bad-status | 1282
+ www.tandfonline.com | blocked-cookie | 757
+ www.tandfonline.com | redirect-loop | 488
+ www.tandfonline.com | spn2-wayback-error | 202
+ www.taylorfrancis.com | | 3979
+ www.taylorfrancis.com | link-loop | 1928
+ www.taylorfrancis.com | no-pdf-link | 1840
+ www.techniques-ingenieur.fr | | 354
+ www.techniques-ingenieur.fr | no-pdf-link | 353
+ www.thieme-connect.de | | 1987
+ www.thieme-connect.de | no-pdf-link | 949
+ www.thieme-connect.de | link-loop | 869
+ www.tib.eu | no-pdf-link | 315
+ www.tib.eu | | 315
+ www.un-ilibrary.org | no-pdf-link | 352
+ www.un-ilibrary.org | | 352
+ www.worldscientific.com | | 668
+ www.worldscientific.com | no-pdf-link | 629
+ www.zora.uzh.ch | | 318
+ zenodo.org | | 46585
+ zenodo.org | no-pdf-link | 29519
+ zenodo.org | success | 14768
+ zenodo.org | terminal-bad-status | 810
+ zenodo.org | wrong-mimetype | 691
+ zenodo.org | spn2-cdx-lookup-failure | 395
+ zenodo.org | spn2-backoff | 294
+ zivahub.uct.ac.za | | 1909
+ zivahub.uct.ac.za | no-pdf-link | 1880
+ zop.zb.uzh.ch | | 228
+ zop.zb.uzh.ch | success | 217
+ | | 365582
+ | success | 141497 38.7%
+ | no-pdf-link | 120852 33.0%
+ | terminal-bad-status | 31900 8.7%
+ | spn2-backoff | 16979 4.6%
+ | link-loop | 13624 3.7%
+ | bad-redirect | 8736
+ | redirect-loop | 7405
+ | gateway-timeout | 6997
+ | spn2-cdx-lookup-failure | 5146
+ | spn2-wayback-error | 3708
+ | wrong-mimetype | 2158
+ | blocked-cookie | 1942
+ | spn2-error:blocked-url | 1733
+ | wayback-error | 1063
+ | spn2-error | 647
+ | spn2-error:500 | 265
+ | cdx-error | 257
+(383 rows)
+
+----
+
+365k in 7 days is about 52k a day, which is about expected. Around 5-7% need
+retries.
+
+important changes:
+- biorxiv.org: needs fix and then retries
+- academic.oup.com: should probably skip
+- apps.crossref.org: need to handle this in code
+- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/`
+- doi.org: investigate redirect-loop and terminal-bad-status
+- osf.io: not getting PDFs
+- papers.ssrn.com: why are these attempted?
+- publons.com: not getting PDFs; special case these?
+- www.sciencedirect.com: not working at all?
+
+smaller:
+- bridges.monash.edu: fix, then retry?
+- dl.acm.org: some broader retries?
+- figshare.com: still some attempts, but almost all no-pdf-link
+- onlinelibrary.wiley.com: getting blocked broadly?
+- www.endocrine-abstracts.org: HTML content?
+- www.igi-global.com: no-pdf-link
diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt
new file mode 100644
index 0000000..be2b30c
--- /dev/null
+++ b/sql/stats/2022-09-06_stats.txt
@@ -0,0 +1,438 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 62 GB | 44 GB | 106 GB
+ "public"."ingest_request" | 51 GB | 50 GB | 101 GB
+ "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB
+ "public"."file_meta" | 39 GB | 39 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7469 MB | 66 MB | 7535 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 198175106 | 282695671015403
+ (1 row)
+
+ 198 million files, 282 TBytes.
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 197021437
+ text/html | 830331
+ application/octet-stream | 186669
+ application/xml | 42170
+ application/xhtml+xml | 38207
+ text/plain | 16471
+ application/jats+xml | 10385
+ application/gzip | 6681
+ | 6032
+ application/postscript | 4916
+ image/jpeg | 4522
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 946
+ application/x-bzip2 | 891
+ image/png | 659
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382
+ application/x-compress | 274
+ video/mp4 | 218
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12800
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 137283420 | 172140506
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 157465613
+ warc/revisit | 11337336
+ text/html | 1137208
+ application/octet-stream | 950380
+ text/xml | 528965
+ unk | 253294
+ application/postscript | 81130
+ application/save | 81069
+ binary/octet-stream | 68942
+ application/x-download | 42717
+ application/download | 40628
+ image/pdf | 39904
+ text/plain | 36445
+ application/force-download | 24148
+ multipart/form-data | 10972
+ application | 5409
+ application/x-octetstream | 5192
+ application/x-msdownload | 3854
+ .pdf | 3518
+ application/x-pdf | 3061
+ application/octet | 1792
+ pdf | 1757
+ application/binary | 1399
+ file | 1373
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1087
+ application/unknown | 1005
+ 0 | 773
+ text/pdf | 729
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 129001717
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 120797098
+ 500 | 8198783
+ -4 | 5802
+ 503 | 36
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 60469462
+ 0.5.5-fatcat | 47472904
+ | 12665498
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | doi | 43852308
+ pdf | mag | 43701948
+ pdf | doaj | 6534341
+ html | doaj | 3987669
+ pdf | arxiv | 2784589
+ pdf | pmc | 2439181
+ pdf | dblp | 631716
+ html | doi | 126699
+ xml | doaj | 23066
+ pdf | cnki_covid19 | 2034
+ pdf | spn | 1026
+ pdf | wanfang_covid19 | 975
+ html | spn | 65
+ xml | spn | 2
+ xml | doi | 1
+ (17 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 24742500
+ pdf | doi | fatcat-ingest | 15592121
+ pdf | doaj | doaj | 6484737
+ html | doaj | doaj | 3987468
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 799793
+ pdf | dblp | dblp | 631716
+ pdf | pmc | fatcat-ingest | 297980
+ html | doi | fatcat-ingest | 121508
+ pdf | pmc | fatcat-changelog | 112376
+ pdf | doaj | fatcat-changelog | 47181
+ xml | doaj | doaj | 23066
+ html | doi | fatcat-changelog | 5129
+ pdf | doaj | fatcat-ingest | 2423
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1814
+ pdf | spn | savepapernow-web | 1026
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ html | doaj | fatcat-ingest | 201
+ html | spn | savepapernow-web | 65
+ html | doi | savepapernow-web | 62
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 4
+ xml | spn | savepapernow-web | 2
+ xml | doi | savepapernow-web | 1
+ (30 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 167653
+ pdf | doaj | 81517
+ pdf | oai | 15282
+ html | doaj | 1791
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (6 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 16024068 | 0.313
+ pdf | unpaywall | 43932525 | 36045446 | 0.820
+ pdf | doi | 43852308 | 14956080 | 0.341
+ pdf | mag | 43701948 | 32768484 | 0.750
+ pdf | doaj | 6534341 | 4704066 | 0.720
+ html | doaj | 3987669 | 778165 | 0.195
+ pdf | arxiv | 2784589 | 2419941 | 0.869
+ pdf | pmc | 2439181 | 1897671 | 0.778
+ pdf | dblp | 631716 | 305142 | 0.483
+ html | doi | 126699 | 75754 | 0.598
+ xml | doaj | 23066 | 10381 | 0.450
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | spn | 1026 | 778 | 0.758
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ html | spn | 65 | 13 | 0.200
+ xml | spn | 2 | 1 | 0.500
+ xml | doi | 1 | 0 | 0.000
+ (17 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------+----------
+ pdf | success | 94887295
+ pdf | no-pdf-link | 33960080
+ pdf | no-capture | 20893916
+ pdf | terminal-bad-status | 6973765
+ pdf | redirect-loop | 5775175
+ pdf | link-loop | 4095424
+ pdf | skip-url-blocklist | 4037518
+ pdf | blocked-cookie | 3508762
+ html | wrong-scope | 1783694
+ pdf | wrong-mimetype | 1379673
+ html | success | 853762
+ pdf | gateway-timeout | 635170
+ html | no-capture | 381283
+ pdf | wayback-content-error | 356694
+ pdf | cdx-error | 347700
+ pdf | null-body | 336166
+ html | unknown-scope | 321874
+ html | html-resource-no-capture | 294294
+ pdf | forbidden | 291127
+ pdf | not-found | 274343
+ pdf | too-many-redirects | 264494
+ component | wrong-mimetype | 196680
+ component | spn2-cdx-lookup-failure | 173615
+ component | spn2-backoff | 115840
+ html | terminal-bad-status | 106264
+ html | null-body | 100296
+ pdf | wayback-error | 94748
+ html | blocked-cookie | 88537
+ component | no-capture | 75278
+ pdf | empty-blob | 61157
+ pdf | bad-redirect | 58680
+ pdf | skip-wall | 57751
+ pdf | spn2-error:too-many-redirects | 52873
+ html | spn2-backoff | 50577
+ pdf | remote-server-error | 41282
+ pdf | invalid-host-resolution | 38864
+ pdf | read-timeout | 37071
+ pdf | spn2-cdx-lookup-failure | 34229
+ html | wrong-mimetype | 33643
+ pdf | spn2-backoff | 32437
+ pdf | petabox-error | 31006
+ html | wayback-content-error | 28034
+ component | spn2-error | 27044
+ pdf | spn2-error:unknown | 25810
+ component | gateway-timeout | 25215
+ pdf | body-too-large | 21721
+ html | petabox-error | 18313
+ html | empty-blob | 14393
+ html | redirect-loop | 13404
+ component | blocked-cookie | 12287
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 45052391
+ pdf | | 26117481
+ pdf | 301 | 4814786
+ html | 200 | 2684821
+ pdf | 403 | 1871088
+ pdf | 404 | 1254259
+ pdf | 302 | 898728
+ pdf | 503 | 867548
+ pdf | 401 | 851205
+ pdf | 429 | 741869
+ pdf | 400 | 624519
+ component | | 456915
+ html | | 442051
+ pdf | 500 | 283700
+ component | 200 | 197510
+ pdf | 410 | 120647
+ pdf | 303 | 107947
+ html | 404 | 80114
+ pdf | 420 | 26722
+ pdf | 502 | 19500
+ pdf | 409 | 15499
+ html | 429 | 15208
+ pdf | 509 | 15167
+ pdf | 999 | 12186
+ pdf | 202 | 11535
+ html | 301 | 10213
+ xml | | 10018
+ pdf | 307 | 8657
+ pdf | 402 | 8338
+ pdf | 412 | 8064
+ pdf | 308 | 6479
+ html | 500 | 4746
+ xml | 200 | 2668
+ pdf | 520 | 2496
+ html | 302 | 2289
+ pdf | 521 | 2257
+ html | 202 | 2177
+ pdf | 206 | 1961
+ html | 403 | 1775
+ pdf | 504 | 1187
+ pdf | 421 | 1148
+ html | 303 | 1112
+ pdf | 406 | 1109
+ pdf | 204 | 772
+ pdf | 432 | 745
+ pdf | 405 | 633
+ html | 400 | 632
+ pdf | 426 | 515
+ pdf | 508 | 503
+ pdf | 505 | 469
+ (50 rows)
diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt
new file mode 100644
index 0000000..0a6254a
--- /dev/null
+++ b/sql/stats/2022-11-23_table_sizes.txt
@@ -0,0 +1,21 @@
+PostgreSQL 13.2 - wbgrp-svc506.us.archive.org
+Size: 1.13T
+
+ table_name | table_size | indexes_size | total_size
+------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 63 GB | 45 GB | 108 GB
+ "public"."ingest_request" | 53 GB | 52 GB | 105 GB
+ "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB
+ "public"."file_meta" | 39 GB | 40 GB | 79 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7879 MB | 68 MB | 7947 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes