-rw-r--r--  .gitlab-ci.yml | 31
-rw-r--r--  README.md | 60
-rw-r--r--  TODO | 2
-rw-r--r--  extra/RUNBOOK.md (renamed from RUNBOOK.md) | 4
-rw-r--r--  extra/blobs/README.md (renamed from blobs/README.md) | 0
-rw-r--r--  extra/blobs/minio/README.md (renamed from blobs/minio/README.md) | 0
-rw-r--r--  extra/blobs/minio/minio.conf (renamed from blobs/minio/minio.conf) | 0
-rw-r--r--  extra/blobs/seaweedfs/README.md (renamed from blobs/seaweedfs/README.md) | 0
-rw-r--r--  extra/blobs/tasks.md (renamed from blobs/tasks.md) | 4
-rw-r--r--  extra/hbase/howto.md (renamed from hbase/howto.md) | 0
-rw-r--r--  extra/hbase/notes.txt (renamed from hbase/notes.txt) | 0
-rw-r--r--  extra/hbase/schema_design.md (renamed from hbase/schema_design.md) | 0
-rw-r--r--  extra/nginx/README.md (renamed from nginx/README.md) | 0
-rw-r--r--  extra/nginx/fatcat-blobs (renamed from nginx/fatcat-blobs) | 0
-rw-r--r--  extra/nginx/sandcrawler-db (renamed from nginx/sandcrawler-db) | 0
-rw-r--r--  extra/nginx/sandcrawler-minio (renamed from nginx/sandcrawler-minio) | 0
-rw-r--r--  kafka/howto_rebalance.md | 3
-rw-r--r--  kafka/monitoring_commands.md | 4
-rw-r--r--  kafka/topics.md | 14
-rw-r--r--  notes/dryad_datasets.md | 17
-rw-r--r--  notes/examples/2021-11-12_broken_grobid_xml.md | 83
-rw-r--r--  notes/examples/dataset_examples.txt | 52
-rw-r--r--  notes/examples/html_test_journals.txt | 153
-rw-r--r--  notes/examples/random_datasets.md | 19
-rw-r--r--  notes/ingest/2020-11-04_arxiv.md | 12
-rw-r--r--  notes/ingest/2020-12-08_patch_crawl_notes.md | 111
-rw-r--r--  notes/ingest/2021-04_unpaywall.md | 368
-rw-r--r--  notes/ingest/2021-05_daily_improvements.md | 480
-rw-r--r--  notes/ingest/2021-07_unpaywall.md | 320
-rw-r--r--  notes/ingest/2021-08_mag.md | 400
-rw-r--r--  notes/ingest/2021-09-02_oai_pmh_patch.md | 1578
-rw-r--r--  notes/ingest/2021-09-03_daily_improvements.md | 1021
-rw-r--r--  notes/ingest/2021-09-03_patch_crawl.md | 678
-rw-r--r--  notes/ingest/2021-12-13_datasets.md | 504
-rw-r--r--  notes/ingest/2022-01-06_patch_crawl.md | 398
-rw-r--r--  notes/ingest/2022-01-13_doi_crawl.md | 248
-rw-r--r--  notes/ingest/2022-03_doaj.md | 278
-rw-r--r--  notes/ingest/2022-03_oaipmh.md | 40
-rw-r--r--  notes/ingest/2022-04_targeted.md | 144
-rw-r--r--  notes/ingest/2022-04_unpaywall.md | 278
-rw-r--r--  notes/ingest/2022-07-15_ingest_fixes.md | 831
-rw-r--r--  notes/ingest/2022-07-19_dblp.md | 50
-rw-r--r--  notes/ingest/2022-07_doaj.md | 199
-rw-r--r--  notes/ingest/2022-07_targeted.md | 140
-rw-r--r--  notes/ingest/2022-09_oaipmh.md | 397
-rw-r--r--  notes/ingest_domains.txt | 294
-rw-r--r--  notes/possible_ingest_targets.txt | 15
-rw-r--r--  notes/tasks/2020-10-21_pdfextract_holes.md | 74
-rw-r--r--  notes/tasks/2021-09-09_pdf_url_lists.md | 70
-rw-r--r--  notes/tasks/2021-10-29_crossref_refs_backfill.md | 235
-rw-r--r--  notes/tasks/2021-12-06_regrobid.md | 380
-rw-r--r--  notes/tasks/2022-01-07_grobid_platform_pdfs.md | 23
-rw-r--r--  notes/tasks/2022-03-07_ukraine_firedrill.md | 225
-rw-r--r--  notes/tasks/2022-04-27_pdf_url_lists.md | 72
-rw-r--r--  notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md | 132
-rwxr-xr-x  please | 24
-rw-r--r--  proposals/2018_original_sandcrawler_rfc.md (renamed from sandcrawler-rfc.md) | 2
-rw-r--r--  proposals/2019_ingest.md | 6
-rw-r--r--  proposals/20200129_pdf_ingest.md | 10
-rw-r--r--  proposals/20200207_pdftrio.md | 5
-rw-r--r--  proposals/20201012_no_capture.md | 7
-rw-r--r--  proposals/20201103_xml_ingest.md | 21
-rw-r--r--  proposals/2020_pdf_meta_thumbnails.md | 4
-rw-r--r--  proposals/2020_seaweed_s3.md | 2
-rw-r--r--  proposals/2021-04-22_crossref_db.md | 86
-rw-r--r--  proposals/2021-09-09_component_ingest.md | 114
-rw-r--r--  proposals/2021-09-09_fileset_ingest.md | 343
-rw-r--r--  proposals/2021-09-13_src_ingest.md | 53
-rw-r--r--  proposals/2021-09-21_spn_accounts.md | 14
-rw-r--r--  proposals/2021-10-28_grobid_refs.md | 125
-rw-r--r--  proposals/2021-12-09_trawling.md | 180
-rw-r--r--  proposals/brainstorm/2021-debug_web_interface.md | 9
-rw-r--r--  proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md | 36
-rw-r--r--  python/.coveragerc | 1
-rw-r--r--  python/.flake8 | 21
-rw-r--r--  python/.gitignore | 12
-rw-r--r--  python/Makefile | 8
-rw-r--r--  python/Pipfile | 29
-rw-r--r--  python/Pipfile.lock | 2247
-rw-r--r--  python/README.md | 46
-rw-r--r--  python/TODO | 7
-rw-r--r--  python/example.env | 1
-rwxr-xr-x  python/grobid2json.py | 215
-rwxr-xr-x  python/grobid_tool.py | 149
-rwxr-xr-x  python/ia_pdf_match.py | 93
-rwxr-xr-x  python/ingest_file.py | 100
-rwxr-xr-x  python/ingest_tool.py | 244
-rwxr-xr-x  python/pdfextract_tool.py | 110
-rwxr-xr-x  python/pdftrio_tool.py | 110
-rwxr-xr-x  python/persist_tool.py | 234
-rw-r--r--  python/pyproject.toml | 7
-rw-r--r--  python/pytest.ini | 5
-rw-r--r--  python/sandcrawler/__init__.py | 59
-rw-r--r--  python/sandcrawler/db.py | 549
-rw-r--r--  python/sandcrawler/fileset_platforms.py | 832
-rw-r--r--  python/sandcrawler/fileset_strategies.py | 387
-rw-r--r--  python/sandcrawler/fileset_types.py | 74
-rw-r--r--  python/sandcrawler/grobid.py | 358
-rw-r--r--  python/sandcrawler/html.py | 307
-rw-r--r--  python/sandcrawler/html_metadata.py | 615
-rw-r--r--  python/sandcrawler/ia.py | 941
-rw-r--r--  python/sandcrawler/ingest.py | 754
-rw-r--r--  python/sandcrawler/ingest_file.py | 925
-rw-r--r--  python/sandcrawler/ingest_fileset.py | 516
-rw-r--r--  python/sandcrawler/ingest_html.py (renamed from python/sandcrawler/html_ingest.py) | 218
-rw-r--r--  python/sandcrawler/minio.py | 45
-rw-r--r--  python/sandcrawler/misc.py | 205
-rw-r--r--  python/sandcrawler/pdfextract.py | 219
-rw-r--r--  python/sandcrawler/pdftrio.py | 100
-rw-r--r--  python/sandcrawler/persist.py | 701
-rw-r--r--  python/sandcrawler/workers.py | 418
-rw-r--r--  python/sandcrawler/xml.py | 1
-rwxr-xr-x  python/sandcrawler_worker.py | 277
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py | 70
-rwxr-xr-x  python/scripts/archiveorg_fileset.py | 135
-rwxr-xr-x  python/scripts/cdx_collection.py | 82
-rwxr-xr-x  python/scripts/covid2ingestrequest.py | 79
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py | 94
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py | 193
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py | 182
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py | 87
-rwxr-xr-x  python/scripts/enrich_scored_matches.py | 19
-rwxr-xr-x  python/scripts/fetch_cdx_sha1hex.py | 170
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py | 111
-rwxr-xr-x  python/scripts/filter_groupworks.py | 48
-rwxr-xr-x  python/scripts/filter_scored_matches.py | 49
-rwxr-xr-x  python/scripts/grobid_affiliations.py | 37
-rwxr-xr-x  python/scripts/import_grobid_metadata.py | 69
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py | 36
-rwxr-xr-x  python/scripts/manifest_converter.py | 7
-rwxr-xr-x  python/scripts/oai2ingestrequest.py | 112
-rwxr-xr-x  python/scripts/pdf_thumbnail.py | 15
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py | 80
-rw-r--r--  python/tests/files/crossref_api_work_978-3-030-64953-1_4.json | 1
-rw-r--r--  python/tests/files/crossref_api_work_s1047951103000064.json | 1
-rw-r--r--  python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml | 66
-rw-r--r--  python/tests/files/grobid_refs_s1047951103000064.tei.xml | 499
-rw-r--r--  python/tests/files/small.json | 7
-rw-r--r--  python/tests/test_grobid.py | 199
-rw-r--r--  python/tests/test_grobid2json.py | 26
-rw-r--r--  python/tests/test_html.py | 28
-rw-r--r--  python/tests/test_html_ingest.py | 10
-rw-r--r--  python/tests/test_html_metadata.py | 106
-rw-r--r--  python/tests/test_ingest.py | 257
-rw-r--r--  python/tests/test_live_wayback.py | 54
-rw-r--r--  python/tests/test_misc.py | 99
-rw-r--r--  python/tests/test_pdfextract.py | 51
-rw-r--r--  python/tests/test_pushers.py | 33
-rw-r--r--  python/tests/test_savepagenow.py | 265
-rw-r--r--  python/tests/test_wayback.py | 195
-rw-r--r--  python/tests/test_xml.py | 5
-rw-r--r--  python_hadoop/README.md | 8
-rw-r--r--  scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 187
-rw-r--r--  scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala | 175
-rw-r--r--  sql/Makefile | 35
-rw-r--r--  sql/README.md | 26
-rw-r--r--  sql/backfill/backfill.md | 13
-rw-r--r--  sql/dump_file_meta.sql | 2
-rw-r--r--  sql/dump_regrobid_pdf_petabox.sql | 2
-rw-r--r--  sql/dump_reingest_bulk.sql | 31
-rw-r--r--  sql/dump_reingest_old.sql | 36
-rw-r--r--  sql/dump_reingest_quarterly.sql | 105
-rw-r--r--  sql/dump_reingest_spn.sql | 36
-rw-r--r--  sql/dump_reingest_terminalstatus.sql | 34
-rw-r--r--  sql/dump_reingest_weekly.sql | 100
-rw-r--r--  sql/dump_unextracted_pdf.sql | 2
-rw-r--r--  sql/dump_unextracted_pdf_petabox.sql | 2
-rw-r--r--  sql/dump_ungrobid_pdf.sql | 2
-rw-r--r--  sql/dump_ungrobid_pdf_petabox.sql | 2
-rw-r--r--  sql/dump_unmatched_glutton_pdf.sql | 2
-rw-r--r--  sql/ingest_again.md | 28
-rw-r--r--  sql/migrations/2019-12-19-060141_init/up.sql | 72
-rw-r--r--  sql/monitoring_queries.md | 32
-rwxr-xr-x  sql/reingest_bulk.sh | 19
-rwxr-xr-x  sql/reingest_old.sh | 19
-rwxr-xr-x  sql/reingest_quarterly.sh | 17
-rwxr-xr-x  sql/reingest_spn.sh | 19
-rwxr-xr-x  sql/reingest_terminalstatus_forcerecrawl.sh | 19
-rwxr-xr-x  sql/reingest_weekly.sh | 17
-rw-r--r--  sql/stats/2021-04-07_stats.txt | 430
-rw-r--r--  sql/stats/2021-04-08_table_sizes.txt | 40
-rw-r--r--  sql/stats/2021-04-12_ingest_domain_summary_30d.txt | 345
-rw-r--r--  sql/stats/2021-11-01_table_sizes.txt | 19
-rw-r--r--  sql/stats/2021-11-26_stats.txt | 424
-rw-r--r--  sql/stats/2021-12-02_table_sizes.txt | 22
-rw-r--r--  sql/stats/2022-04-26_stats.txt | 432
-rw-r--r--  sql/stats/2022-04-27_crawl_changelog.txt | 191
-rw-r--r--  sql/stats/2022-05-11_crawl_changelog.txt | 410
-rw-r--r--  sql/stats/2022-09-06_stats.txt | 438
-rw-r--r--  sql/stats/2022-11-23_table_sizes.txt | 21
-rw-r--r--  sql/stats/README.md | 13
191 files changed, 26033 insertions, 4993 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 8d83b97..457a250 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,33 +1,30 @@
-image: ubuntu:xenial
-before_script:
- - apt update -qy
- - apt install -y --no-install-recommends apt-transport-https software-properties-common
- - add-apt-repository -y ppa:deadsnakes/ppa
- - add-apt-repository -y ppa:cran/poppler
- # kitware (cmake) APT signing key
- - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 6d903995424a83a48d42d53da8e5ef3a02600268
- - apt-add-repository -y 'deb https://apt.kitware.com/ubuntu/ xenial main'
- # SBT bintray APT signing key
- - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
- - apt-add-repository -y "deb https://dl.bintray.com/sbt/debian /"
- - apt update -qy
- - apt install -y --no-install-recommends python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1
- - python3 -m pip install --upgrade pip
- - python3 -m pip install pipenv
- - pipenv --version
+
+image: ubuntu:focal
variables:
LC_ALL: "C.UTF-8"
LANG: "C.UTF-8"
+ DEBIAN_FRONTEND: "noninteractive"
+
+before_script:
+ - apt update -qy
+ - apt install -y --no-install-recommends apt-transport-https software-properties-common curl dirmngr gpg-agent
+ # scala-sbt.org APT signing key
+ - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0x2EE0EA64E40A89B84B2DF73499E82A75642AC823
+ - apt-add-repository -y "deb https://repo.scala-sbt.org/scalasbt/debian all main"
+ - apt install -y --no-install-recommends python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1 pipenv wget
+ - pipenv --version
test_python:
script:
- cd python
+ - cp example.env .env
- pipenv install --dev --deploy
- make coverage
- make lint
test_python_hadoop:
+ when: manual
script:
- cd python_hadoop
- pipenv install --dev --deploy
diff --git a/README.md b/README.md
index afe1ff6..b29e397 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,25 @@
\ooooooo| |___/\__,_|_| |_|\__,_|\___|_| \__,_| \_/\_/ |_|\___|_|
-This repo contains back-end python workers, scripts, hadoop jobs, luigi tasks,
-and other scripts and code for the Internet Archive web group's journal ingest
-pipeline. This code is of mixed quality and is mostly experimental. The goal
-for most of this is to submit metadata to [fatcat](https://fatcat.wiki), which
-is the more stable, maintained, and public-facing service.
-
-Code in this repository is potentially public! Not intented to accept public
-contributions for the most part. Much of this will not work outside the IA
-cluster environment.
+This repo contains back-end python workers, scripts, config files, and other
+stuff related to the Internet Archive web group's scholarly web preservation
+and processing pipeline. It is a complement to [fatcat](https://fatcat.wiki),
+which is an open catalog of research outputs, including preservation metadata.
+
+The sandcrawler part of the project deals with content crawled from the web
+into either web.archive.org or archive.org collections, and post-processing
+that content. For example, extracting text from PDF files, verifying mimetypes,
+and checking archival status. The resulting metadata ends up getting filtered,
+transformed, and pushed into fatcat itself for public use.
+
+While code in this repository is public, it is mostly IA-specific and may not
+even run outside the IA data centers due to library dependencies and
+authentication needs. Code quality and documentation are generally poor compared
+to fatcat.
+
+As of December 2022, the best document to read for "getting started" in
+understanding the ingest system is `proposals/2019_ingest.md`, and then
+subsequent proposals expanding on that foundation.
Archive-specific deployment/production guides and ansible scripts at:
[journal-infra](https://git.archive.org/webgroup/journal-infra)
@@ -22,33 +32,35 @@ Archive-specific deployment/production guides and ansible scripts at:
## Repository Layout
-**./proposals/** design documentation and change proposals
-
**./python/** contains scripts and utilities for ingesting content from wayback
-and/or the web (via save-page-now API), and other processing pipelines
+and/or the web (via save-page-now API), and other processing pipelines. Most of
+the active code is in here. See the included README (`./python/README.md`).
**./sql/** contains schema, queries, and backfill scripts for a Postgres SQL
database index (eg, file metadata, CDX, and GROBID status tables).
-**./pig/** contains a handful of Pig scripts, as well as some unittests
-implemented in python. Only rarely used.
+**./python_hadoop/** contains Hadoop streaming jobs written in python using the
+`mrjob` library. The HBase backfill code path is still used occasionally.
-**./scalding/** contains Hadoop jobs written in Scala using the Scalding
-framework. The intent is to write new non-trivial Hadoop jobs in Scala, which
-brings type safety and compiled performance. Mostly DEPRECATED.
+**./proposals/** design documentation and change proposals
-**./python_hadoop/** contains Hadoop streaming jobs written in python using the
-`mrjob` library. Mostly DEPRECATED.
+**./notes/ingest/** log of bulk crawls and metadata loads
+**./extra/docker/** docker-compose setup that may be useful for documentation
+(includes Kafka, PostgreSQL, etc)
-## Running Python Code
+**./.gitlab-ci.yml** current CI setup script, which documents dependencies
-You need python3.8 (or python3.6+ and `pyenv`) and `pipenv` to set up the
-environment. You may also need the debian packages `libpq-dev` and `
-`python-dev` to install some dependencies.
+**./pig/** contains a handful of Pig scripts, as well as some unittests
+implemented in python. Only rarely used.
+
+**./scalding/** contains Hadoop jobs written in Scala using the Scalding
+framework. The intent is to write new non-trivial Hadoop jobs in Scala, which
+brings type safety and compiled performance. Mostly DEPRECATED, this code has
+not been run in years.
-## Running Hadoop Jobs (DEPRECATED)
+## Running Python Hadoop Jobs
The `./please` python3 wrapper script is a helper for running jobs (python or
scalding) on the IA Hadoop cluster. You'll need to run the setup/dependency
diff --git a/TODO b/TODO
index 77b48c9..33dc147 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,6 @@
+Note: as of 2022 this file is ancient and needs review
+
## Kafka Pipelines
- after network split, mass restarting import/harvest stuff seemed to
diff --git a/RUNBOOK.md b/extra/RUNBOOK.md
index 33d4711..6c4165d 100644
--- a/RUNBOOK.md
+++ b/extra/RUNBOOK.md
@@ -23,7 +23,7 @@ Copy/transfer to a Kafka node; load a sample and then the whole output:
Older example; if this fails, need to re-run entire thing:
- cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
TODO: is it possible to use job log with millions of `--pipe` inputs? That
would be more efficient in the event of failure.
@@ -35,7 +35,7 @@ Want to use GNU/Parallel in a mode that will do retries well:
fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
sort | \
parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
- './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
After starting, check that messages are actually getting pushed to kafka
(producer failures can be silent!). If anything goes wrong, run the exact same
diff --git a/blobs/README.md b/extra/blobs/README.md
index 555db92..555db92 100644
--- a/blobs/README.md
+++ b/extra/blobs/README.md
diff --git a/blobs/minio/README.md b/extra/blobs/minio/README.md
index d8f1c69..d8f1c69 100644
--- a/blobs/minio/README.md
+++ b/extra/blobs/minio/README.md
diff --git a/blobs/minio/minio.conf b/extra/blobs/minio/minio.conf
index 2e93f9a..2e93f9a 100644
--- a/blobs/minio/minio.conf
+++ b/extra/blobs/minio/minio.conf
diff --git a/blobs/seaweedfs/README.md b/extra/blobs/seaweedfs/README.md
index d19e9e0..d19e9e0 100644
--- a/blobs/seaweedfs/README.md
+++ b/extra/blobs/seaweedfs/README.md
diff --git a/blobs/tasks.md b/extra/blobs/tasks.md
index 34dec8f..beb765f 100644
--- a/blobs/tasks.md
+++ b/extra/blobs/tasks.md
@@ -19,7 +19,7 @@ didn't try to connect to postgresql.
Commands:
- ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
=> Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed
=> run briefly, then kill
@@ -29,7 +29,7 @@ On kafka-broker worker:
Then run 2x instances of worker (same command as above):
- ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
At this point CPU-limited on this worker by the python processes (only 4 cores
on this machine).
diff --git a/hbase/howto.md b/extra/hbase/howto.md
index 26d33f4..26d33f4 100644
--- a/hbase/howto.md
+++ b/extra/hbase/howto.md
diff --git a/hbase/notes.txt b/extra/hbase/notes.txt
index 20f406f..20f406f 100644
--- a/hbase/notes.txt
+++ b/extra/hbase/notes.txt
diff --git a/hbase/schema_design.md b/extra/hbase/schema_design.md
index 2db8998..2db8998 100644
--- a/hbase/schema_design.md
+++ b/extra/hbase/schema_design.md
diff --git a/nginx/README.md b/extra/nginx/README.md
index 0369f9b..0369f9b 100644
--- a/nginx/README.md
+++ b/extra/nginx/README.md
diff --git a/nginx/fatcat-blobs b/extra/nginx/fatcat-blobs
index 5c692ef..5c692ef 100644
--- a/nginx/fatcat-blobs
+++ b/extra/nginx/fatcat-blobs
diff --git a/nginx/sandcrawler-db b/extra/nginx/sandcrawler-db
index 67d1a2d..67d1a2d 100644
--- a/nginx/sandcrawler-db
+++ b/extra/nginx/sandcrawler-db
diff --git a/nginx/sandcrawler-minio b/extra/nginx/sandcrawler-minio
index 2e9bfe3..2e9bfe3 100644
--- a/nginx/sandcrawler-minio
+++ b/extra/nginx/sandcrawler-minio
diff --git a/kafka/howto_rebalance.md b/kafka/howto_rebalance.md
index d68b205..093740a 100644
--- a/kafka/howto_rebalance.md
+++ b/kafka/howto_rebalance.md
@@ -27,7 +27,8 @@ On a kafka broker, go to `/srv/kafka-broker/kafka-*/bin`, generate a plan, then
inspect the output:
./kafka-reassign-partitions.sh --zookeeper localhost:2181 --broker-list "280,281,284,285,263" --topics-to-move-json-file /tmp/topics_to_move.json --generate > /tmp/reassignment-plan.json
- cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 > /tmp/new-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | head -n1 | jq . > /tmp/old-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 | jq . > /tmp/new-plan.json
cat /tmp/reassignment-plan.json | rg '^\{' | jq .
If that looks good, start the rebalance:
diff --git a/kafka/monitoring_commands.md b/kafka/monitoring_commands.md
new file mode 100644
index 0000000..c0c330f
--- /dev/null
+++ b/kafka/monitoring_commands.md
@@ -0,0 +1,4 @@
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.status, .base_url]' -c
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.request.ingest_request_source, .status, .request.base_url, .terminal.terminal_url]' -c
diff --git a/kafka/topics.md b/kafka/topics.md
index 06faf8e..a699e16 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -25,7 +25,8 @@ retention (on both a size and time basis).
=> fewer partitions with batch mode, but still a bunch (24?)
=> key is sha1hex of PDF. enable time compaction (6 months?)
- sandcrawler-ENV.ingest-file-requests
+ sandcrawler-ENV.ingest-file-requests-daily
+ => was ingest-file-requests previously, but renamed/rebalanced
=> ingest requests from multiple sources; mostly continuous or pseudo-interactive
=> schema is JSON; see ingest proposal for fields. small objects.
=> fewer partitions with batch mode, but still a bunch (24)
@@ -35,6 +36,10 @@ retention (on both a size and time basis).
=> ingest requests from bulk crawl sources; background processing
=> same as ingest-file-requests
+ sandcrawler-ENV.ingest-file-requests-priority
+ => ingest requests from bulk crawl sources; background processing
+ => same as ingest-file-requests
+
sandcrawler-ENV.ingest-file-results
=> ingest requests from multiple sources
=> schema is JSON; see ingest proposal for fields. small objects.
@@ -113,9 +118,6 @@ retention (on both a size and time basis).
=> v03 is newer v0.3.0 API schema (backwards incompatible)
=> key: fcid
=> 8x partitions
- fatcat-ENV.work-updates
- => key: fcid
- => 8x partitions
fatcat-ENV.container-updates
=> key: fcid
=> 4x partitions
@@ -174,15 +176,15 @@ exists`; this seems safe, and the settings won't be over-ridden.
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ungrobided-pg
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.grobid-output-pg --config compression.type=gzip --config cleanup.policy=compact
- ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ingest-file-requests --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ingest-file-requests-daily --config retention.ms=7889400000 --config cleanup.policy=delete
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-requests-priority --config retention.ms=7889400000 --config cleanup.policy=delete
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03
- ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.file-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.container-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic fatcat-qa.work-ident-updates
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md
new file mode 100644
index 0000000..5c727b1
--- /dev/null
+++ b/notes/dryad_datasets.md
@@ -0,0 +1,17 @@
+
+api docs: https://datadryad.org/api/v2/docs
+
+current search queries return 38,000 hits (December 2020)
+
+example with multiple versions:
+ https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions
+
+
+how to handle versions? DOI doesn't get incremented.
+
+on archive.org, could have separate item for each version, or sub-directories within item, one for each version
+
+in fatcat, could have a release for each version, but only one with
+the DOI; or could have a separate fileset for each version
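+
+A minimal sketch (not sandcrawler code) of walking the versions endpoint above
+with python `requests`; the `_embedded` / `stash:versions` key names are
+assumptions to double-check against the API docs:
+
+    import json
+    import urllib.parse
+
+    import requests
+
+    def dryad_versions(doi: str) -> list:
+        """Fetch version records for a Dryad dataset DOI (eg 'doi:10.5061/dryad.fbg79cnr0')."""
+        encoded = urllib.parse.quote(doi, safe="")
+        resp = requests.get(f"https://datadryad.org/api/v2/datasets/{encoded}/versions")
+        resp.raise_for_status()
+        # responses are HAL-style JSON; verify the exact key names against the docs
+        return resp.json().get("_embedded", {}).get("stash:versions", [])
+
+    for version in dryad_versions("doi:10.5061/dryad.fbg79cnr0"):
+        print(json.dumps(version, indent=2))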
diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md
new file mode 100644
index 0000000..5223651
--- /dev/null
+++ b/notes/examples/2021-11-12_broken_grobid_xml.md
@@ -0,0 +1,83 @@
+
+Find all the PDFs from web which resulted in `bad-grobid-xml` status code (among others):
+
+ sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100;
+
+ sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata
+ ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------
+ d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"}
+ 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"}
+ 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf
+ FIXED
+ f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"}
+ https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf
+ FIXED
+ c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"}
+ https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf
+ FIXED (and good)
+ 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"}
+ https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf
+ FIXED
+ metadata quality mixed, but complex document (?)
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"}
+ https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23
+ FIXED
+ 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"}
+ https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf
+ FIXED
+ 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"}
+ not found
+ acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"}
+ https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf
+ BROKEN: not well-formed (invalid token): line 60, column 45
+ <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note>
+ 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"}
+ not found
+ c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"}
+ https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308
+ BROKEN: not well-formed (invalid token): line 58, column 45
+ <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, &amp; Bian, 2020).</note>
+ 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 1824, column 45"}
+ not found
+ 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"}
+ not found
+ f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"}
+ https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649
+ FIXED, good
+ f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf
+ FIXED
+ 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"}
+ https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf
+ FIXED
+ 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1
+ FIXED
+ (21 rows)
+
+Some other errors from other queries:
+
+ d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"}
+ https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf
+ FIXED: with 0.7.0+
+
+ 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"}
+ https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf
+ still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500
+ BAD PDF ("no pages" in evince)
+
+ d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"}
+ https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf
+ FIXED
+
+ 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t
+ https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf
+ FIXED
+
+In summary, there are still a small number of `bad-grobid-xml` cases, and still
+many "very large PDF" cases. But we should probably broadly retry everything,
+especially the 503 errors (from when GROBID is simply down/unavailable).
+
+The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations,
+which I have submitted a patch/PR for.
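+
+The parse error strings above are python `xml.etree`/expat messages; a small
+sketch (not the actual sandcrawler code path) for re-checking a single TEI-XML
+document by hand:
+
+    import sys
+    import xml.etree.ElementTree as ET
+
+    def check_tei_xml(path: str) -> str:
+        """Return 'success' if the TEI-XML parses, else a bad-grobid-xml style message."""
+        with open(path, "rb") as f:
+            raw = f.read()
+        try:
+            ET.fromstring(raw)
+            return "success"
+        except ET.ParseError as e:
+            # eg: "not well-formed (invalid token): line 60, column 45"
+            return f"bad-grobid-xml: {e}"
+
+    if __name__ == "__main__":
+        print(check_tei_xml(sys.argv[1]))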
diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt
new file mode 100644
index 0000000..3a04750
--- /dev/null
+++ b/notes/examples/dataset_examples.txt
@@ -0,0 +1,52 @@
+
+### ArchiveOrg: CAT dataset
+
+<https://archive.org/details/CAT_DATASET>
+
+`release_36vy7s5gtba67fmyxlmijpsaui`
+
+###
+
+<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635>
+
+doi:10.1371/journal.pone.0120448
+
+Single .rar file
+
+### Dataverse
+
+<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B>
+
+Single excel file
+
+### Dataverse
+
+<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+doi:10.7910/DVN/CLSFKX
+
+Multiple files; multiple versions?
+
+API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+ .data.id
+ .data.latestVersion.datasetPersistentId
+ .data.latestVersion.versionNumber, .versionMinorNumber
+ .data.latestVersion.files[]
+ .dataFile
+ .contentType (mimetype)
+ .filename
+ .filesize (int, bytes)
+ .md5
+ .persistendId
+ .description
+ .label (filename?)
+ .version
+
+Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB>
+
+Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3)
+
+Dataverse refs:
+- 'doi' and 'hdl' are the two persistentId styles
+- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled
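+
+A rough sketch (hypothetical helper, not sandcrawler code) of walking the
+dataset API fields listed above to build a file manifest; assumes the
+dataverse.harvard.edu instance:
+
+    import requests
+
+    def dataverse_file_manifest(persistent_id: str, host: str = "https://dataverse.harvard.edu") -> list:
+        """List files in the latest version of a dataverse dataset (eg 'doi:10.7910/DVN/CLSFKX')."""
+        resp = requests.get(
+            f"{host}/api/datasets/:persistentId/",
+            params={"persistentId": persistent_id},
+        )
+        resp.raise_for_status()
+        latest = resp.json()["data"]["latestVersion"]
+        manifest = []
+        for f in latest.get("files", []):
+            df = f.get("dataFile", {})
+            manifest.append({
+                "filename": df.get("filename"),
+                "size": df.get("filesize"),
+                "md5": df.get("md5"),
+                "mimetype": df.get("contentType"),
+                # file-level persistentId is optional, per-instance (filePIDsEnabled)
+                "persistent_id": df.get("persistentId"),
+            })
+        return manifest
+
+    for row in dataverse_file_manifest("doi:10.7910/DVN/CLSFKX"):
+        print(row)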
diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt
new file mode 100644
index 0000000..540dc9f
--- /dev/null
+++ b/notes/examples/html_test_journals.txt
@@ -0,0 +1,153 @@
+
+Good examples of journals to run HTML fulltext extraction on.
+
+## Live Web
+
+d-lib magazine
+ live web
+ no longer active
+ http://www.dlib.org/back.html
+
+NLM technical bulletin
+ https://www.nlm.nih.gov/pubs/techbull/back_issues.html
+
+Genders
+ https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html
+
+firstmondays
+ live web; now OJS
+
+outhistory.org
+
+http://journal.sjdm.org/
+
+http://whoosh.org/
+
+
+## Vanished (but wayback coverage)
+
+ohmylittledata
+ issn:2551-1289
+ vanished
+ blog format
+ http://web.archive.org/web/20180421061156/https://ohmylittledata.com/
+
+exquisite corpse
+ https://web.archive.org/web/20080521052400/http://corpse.org:80/
+
+Journal of Mundane Behavior
+ https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya
+ ISSN: 1529-3041
+
+ defunct since ~2010
+ simple HTML articles
+ references
+ http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm
+ http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm
+
+War Crimes
+
+ PDF articles (not HTML)
+ http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/
+
+
+## DOAJ Test Articles (HTML)
+
+ zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt
+ => 2,184,954
+
+ cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25
+ 254817 link.springer.com
+ 145159 www.scielo.br
+ 78044 journal.frontiersin.org
+ 77394 www.frontiersin.org
+ 40849 www.dovepress.com
+ 19024 dergipark.org.tr
+ 18758 periodicos.ufsc.br
+ 16346 www.revistas.usp.br
+ 15872 revistas.unal.edu.co
+ 15527 revistas.ucm.es
+ 13669 revistas.usal.es
+ 12640 dergipark.gov.tr
+ 12111 journals.rudn.ru
+ 11839 www.scielosp.org
+ 11277 www.karger.com
+ 10827 www.journals.vu.lt
+ 10318
+ 9854 peerj.com
+ 9100 ojs.unud.ac.id
+ 8581 jurnal.ugm.ac.id
+ 8261 riviste.unimi.it
+ 8012 journals.uran.ua
+ 7454 revistas.pucp.edu.pe
+ 7264 journals.vgtu.lt
+ 7200 publicaciones.banrepcultural.org
+
+ cat html_fulltext_urls.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.filtered.txt
+ => 1,579,257
+
+ zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt
+ => 560k
+
+ cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25
+ 40849 www.dovepress.com
+ 10570 journals.rudn.ru
+ 10494 dergipark.org.tr
+ 10233 revistas.unal.edu.co
+ 9981 dergipark.gov.tr
+ 9428 revistas.usal.es
+ 8292 revistas.ucm.es
+ 7200 publicaciones.banrepcultural.org
+ 6953 revistas.pucp.edu.pe
+ 6000 www.scielosp.org
+ 5962 www.scielo.br
+ 5621 www.richtmann.org
+ 5123 scielo.sld.cu
+ 5067 ojs.unud.ac.id
+ 4838 periodicos.ufsc.br
+ 4736 revistasonlinepre.inap.es
+ 4486 journal.fi
+ 4221 www.seer.ufu.br
+ 3553 revistas.uam.es
+ 3492 revistas.pucsp.br
+ 3060 www.scielo.org.co
+ 2991 scielo.isciii.es
+ 2802 seer.ufrgs.br
+ 2692 revistas.unc.edu.ar
+ 2685 srl.si
+
+ cat html_fulltext_urls.no_doi.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.no_doi.filtered.txt
+ => 518,608
+
+ zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20
+ https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795
+ https://journal.umy.ac.id/index.php/st/article/view/3297
+ https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442
+ http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf
+ http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440
+ https://journal.fi/inf/article/view/59430
+ http://journal.uii.ac.id/index.php/Eksakta/article/view/2429
+ https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS
+ https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157
+ http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce
+ http://revistas.pucp.edu.pe/index.php/themis/article/view/11862
+ http://journal.bdfish.org/index.php/fisheries/article/view/91
+ https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567
+ https://www.lithosphere.ru/jour/article/view/779
+ https://journals.hioa.no/index.php/seminar/article/view/2412
+ http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197
+ https://www.kmuj.kmu.edu.pk/article/view/15698
+ http://forodeeducacion.com/ojs/index.php/fde/article/view/82
+ https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941
+ http://grbs.library.duke.edu/article/view/3361
+
diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md
new file mode 100644
index 0000000..b69132c
--- /dev/null
+++ b/notes/examples/random_datasets.md
@@ -0,0 +1,19 @@
+
+Possible external datasets to ingest (which are not entire platforms):
+
+- https://research.google/tools/datasets/
+- https://openslr.org/index.html
+- https://www.kaggle.com/datasets?sort=votes&tasks=true
+- https://archive.ics.uci.edu/ml/datasets.php
+
+Existing archive.org datasets to ingest:
+
+- https://archive.org/details/allthemusicllc-datasets
+
+Papers on archive.org to ingest:
+
+- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=>
+- <https://archive.org/details/biorxiv>
+- <https://archive.org/details/philosophicaltransactions?tab=collection>
+- <https://archive.org/search.php?query=doi%3A%2A>
+- <https://archive.org/details/folkscanomy_academic>
diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md
new file mode 100644
index 0000000..f9abe09
--- /dev/null
+++ b/notes/ingest/2020-11-04_arxiv.md
@@ -0,0 +1,12 @@
+
+Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run
+a crawl.
+
+Crawl is now done, so going to ingest, hoping to get the majority of the
+millions of remaining arxiv.org PDFs.
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l
+ => 1,288,559
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-12-08_patch_crawl_notes.md b/notes/ingest/2020-12-08_patch_crawl_notes.md
new file mode 100644
index 0000000..5979753
--- /dev/null
+++ b/notes/ingest/2020-12-08_patch_crawl_notes.md
@@ -0,0 +1,111 @@
+
+Notes here about re-ingesting or re-crawling large batches. Goal around end of
+2020 is to generate a broad patch crawl of terminal no-capture attempts for all
+major sources crawled thus far. Have already tried running this process for unpaywall.
+
+For each, want filtered ingest request JSON objects (filtering out platforms
+that don't crawl well, and possibly things like figshare+zenodo), and a broader
+seedlist (including terminal URLs). Will de-dupe all the seedlist URLs and do a
+heritrix crawl with new config, then re-ingest all the requests individually.
+
+Summary of what to do here:
+
+ OA DOI: expecting some 2.4 million seeds
+ OAI-PMH: expecting some 5 million no-capture URLs, plus more from missing PDF URL not found
+ Unpaywall: another ~900k no-capture URLs (maybe filtered?)
+
+For all, re-attempt for these status codes:
+
+ no-capture
+ cdx-error
+ wayback-error
+ petabox-error
+ gateway-timeout (?)
+
+And at least do bulk re-ingest for these, if updated before 2020-11-20 or so:
+
+ no-pdf-link
+
+## OAI-PMH
+
+Need to re-ingest all of the (many!) no-capture and no-pdf-link
+
+TODO: repec-specific URL extraction?
+
+Skip these OAI prefixes:
+
+ kb.dk
+ bnf.fr
+ hispana.mcu.es
+ bdr.oai.bsb-muenchen.de
+ ukm.si
+ hsp.org
+
+Skip these domains:
+
+ www.kb.dk (kb.dk)
+ kb-images.kb.dk (kb.dk)
+ mdz-nbn-resolving.de (TODO: what prefix?)
+ aggr.ukm.um.si (ukm.si)
+
+Check PDF link extraction for these prefixes, or skip them (TODO):
+
+ repec (mixed success)
+ biodiversitylibrary.org
+ juser.fz-juelich.de
+ americanae.aecid.es
+ www.irgrid.ac.cn
+ hal
+ espace.library.uq.edu.au
+ igi.indrastra.com
+ invenio.nusl.cz
+ hypotheses.org
+ t2r2.star.titech.ac.jp
+ quod.lib.umich.edu
+
+ domain: hemerotecadigital.bne.es
+ domain: bib-pubdb1.desy.de
+ domain: publikationen.bibliothek.kit.edu
+ domain: edoc.mpg.de
+ domain: bibliotecadigital.jcyl.es
+ domain: lup.lub.lu.se
+ domain: orbi.uliege.be
+
+TODO:
+- consider deleting ingest requests from skipped prefixes (large database use)
+
+
+## Unpaywall
+
+About 900k `no-capture`, and up to 2.5 million more `no-pdf-link`.
+
+Re-bulk-ingest filtered requests which hit `no-pdf-link` before 2020-11-20:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) < '2020-11-20'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ ) TO '/grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json';
+ => COPY 1309990
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json
+ => 1.31M 0:00:51 [25.6k/s]
+
+    cat /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md
new file mode 100644
index 0000000..d7643f4
--- /dev/null
+++ b/notes/ingest/2021-04_unpaywall.md
@@ -0,0 +1,368 @@
+
+New snapshot released 2021-02-18, finally getting around to a crawl two months
+later.
+
+Intend to do same style of crawl as in the past. One change is that
+sandcrawler-db has moved to a focal VM.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json
+ => 30.0M 3:14:59 [2.57k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json';
+ => COPY 3277484
+
+ # previous, 2020-10 run: COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json
+ => 3.28M 0:01:42 [32.1k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 26385866
+ no-pdf-link | 2132565
+ no-capture | 2092111
+ redirect-loop | 1732543
+ terminal-bad-status | 1504555
+ wayback-content-error | 357345
+ wrong-mimetype | 126070
+ link-loop | 76808
+ cdx-error | 22756
+ null-body | 22066
+ wayback-error | 13768
+ gateway-timeout | 3804
+ petabox-error | 3608
+ spn2-cdx-lookup-failure | 1225
+ redirects-exceeded | 892
+ invalid-host-resolution | 505
+ bad-redirect | 151
+ spn2-error | 108
+ spn2-error:job-failed | 91
+ bad-gzip-encoding | 27
+ (20 rows)
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-01-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 1348623
+ no-capture | 1231582
+ redirect-loop | 45622
+ no-pdf-link | 37312
+ terminal-bad-status | 24162
+ wrong-mimetype | 6684
+ link-loop | 5757
+ null-body | 1288
+ wayback-content-error | 1123
+ cdx-error | 831
+ petabox-error | 697
+ wayback-error | 185
+ invalid-host-resolution | 41
+ gateway-timeout | 29
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ spn2-cdx-lookup-failure | 7
+ bad-redirect | 4
+ timeout | 3
+ redirects-exceeded | 3
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json';
+ => 2020-10: 2,936,404
+ => 2021-04: 1,805,192
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json
+ => 1.81M 0:01:27 [20.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt
+ 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+ 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1,804,211 consumer group lag
+
+## Post-Ingest Stats
+
+Overall status (unpaywall, all time):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 27242251
+ no-pdf-link | 2746237
+ redirect-loop | 1821132
+ terminal-bad-status | 1553441
+ no-capture | 478559
+ wayback-content-error | 357390
+ wrong-mimetype | 127365
+ link-loop | 79389
+ cdx-error | 23170
+ null-body | 23169
+ wayback-error | 13704
+ gateway-timeout | 3803
+ petabox-error | 3642
+ redirects-exceeded | 1427
+ spn2-cdx-lookup-failure | 1214
+ invalid-host-resolution | 505
+ bad-redirect | 153
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ body-too-large | 84
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1213335
+ accepted | no-pdf-link | 29292
+ accepted | redirect-loop | 12769
+ accepted | terminal-bad-status | 11264
+ accepted | no-capture | 10187
+ accepted | cdx-error | 1015
+ accepted | wayback-content-error | 757
+ accepted | wrong-mimetype | 501
+ accepted | link-loop | 407
+ accepted | wayback-error | 207
+ accepted | petabox-error | 189
+ accepted | redirects-exceeded | 125
+ accepted | null-body | 34
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | blocked-cookie | 2
+ accepted | bad-redirect | 1
+ accepted | body-too-large | 1
+ published | success | 20196774
+ published | no-pdf-link | 2647969
+ published | redirect-loop | 1477558
+ published | terminal-bad-status | 1320013
+ published | wayback-content-error | 351931
+ published | no-capture | 297603
+ published | wrong-mimetype | 115440
+ published | link-loop | 76431
+ published | cdx-error | 18125
+ published | null-body | 17559
+ published | wayback-error | 10466
+ published | petabox-error | 2684
+ published | gateway-timeout | 1979
+ published | redirects-exceeded | 947
+ published | spn2-cdx-lookup-failure | 877
+ published | invalid-host-resolution | 457
+ published | bad-redirect | 120
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 70
+ published | body-too-large | 39
+ published | bad-gzip-encoding | 24
+ published | timeout | 24
+ published | blocked-cookie | 23
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | | 2
+ published | pending | 1
+ published | spn2-error:pending | 1
+ published | too-many-redirects | 1
+ submitted | success | 5832117
+ submitted | redirect-loop | 330785
+ submitted | terminal-bad-status | 222152
+ submitted | no-capture | 170766
+ submitted | no-pdf-link | 68934
+ submitted | wrong-mimetype | 11424
+ submitted | null-body | 5576
+ submitted | wayback-content-error | 4702
+ submitted | cdx-error | 4030
+ submitted | wayback-error | 3031
+ submitted | link-loop | 2551
+ submitted | gateway-timeout | 1820
+ submitted | petabox-error | 769
+ submitted | redirects-exceeded | 355
+ submitted | spn2-cdx-lookup-failure | 332
+ submitted | invalid-host-resolution | 48
+ submitted | body-too-large | 44
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 32
+ submitted | spn2-error:job-failed | 14
+ submitted | | 13
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | timeout | 4
+ submitted | bad-gzip-encoding | 3
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 12
+ | no-capture | 3
+ (76 rows)
+
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2192376
+ no-capture | 152183
+ no-pdf-link | 144174
+ redirect-loop | 125988
+ terminal-bad-status | 67307
+ link-loop | 8292
+ wrong-mimetype | 7942
+ null-body | 2270
+ cdx-error | 1223
+ wayback-content-error | 1147
+ petabox-error | 728
+ wayback-error | 155
+ body-too-large | 82
+ invalid-host-resolution | 41
+ gateway-timeout | 28
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ timeout | 7
+ bad-redirect | 6
+ redirects-exceeded | 4
+ (20 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 2,703,999 raw ingest requests (new URLs total)
+- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet
+- 843,753 (31.2%) new successes from heritrix crawling
+- 2,192,376 (81.1%) total success (including URLs crawled earlier for other reasons; percentage is out of all new URLs, including those not expected to succeed)
diff --git a/notes/ingest/2021-05_daily_improvements.md b/notes/ingest/2021-05_daily_improvements.md
new file mode 100644
index 0000000..e8748fa
--- /dev/null
+++ b/notes/ingest/2021-05_daily_improvements.md
@@ -0,0 +1,480 @@
+
+Summary of top large broken domains (2021-04-21 "30 day" snapshot):
+
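+The per-domain breakdowns below were presumably pulled with something along
+these lines (a sketch, not the exact query that was run; domain extracted from
+`terminal_url` with a regex):
+
+    SELECT domain, status, COUNT((domain, status))
+    FROM (
+        SELECT
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)/') AS domain
+        FROM ingest_file_result
+        WHERE ingest_file_result.updated >= '2021-03-01'
+    ) t1
+    WHERE domain != ''
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 100;
+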
+## acervus.unicamp.br
+
+ domain | status | count
+---------------------------------------+-------------------------+--------
+ acervus.unicamp.br | | 1967
+ acervus.unicamp.br | no-pdf-link | 1853
+
+select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%acervus.unicamp.br%' and status = 'no-pdf-link' limit 5;
+
+http://acervus.unicamp.br/index.asp?codigo_sophia=963332
+
+seems like many of these were captures with a blank page? or a redirect to
+the homepage?
+
+http://web.archive.org/web/20200129110523/http://acervus.unicamp.br/index.html
+
+messy, going to move on.
+
+
+## apex.ipk-gatersleben.de
+
+apex.ipk-gatersleben.de | | 1253
+apex.ipk-gatersleben.de | no-pdf-link | 1132
+
+select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%apex.ipk-gatersleben.de%' and status = 'no-pdf-link' limit 5;
+
+https://doi.org/10.25642/ipk/rescoll/4886
+https://apex.ipk-gatersleben.de/apex/f?p=PGRDOI:RESOLVE:::NO:RP:DOI:10.25642/IPK/RESCOLL/7331
+
+seem to be datasets/species, not articles.
+
+prefix: 10.25642/ipk
+
+## crossref.org
+
+ apps.crossref.org | | 4693
+ apps.crossref.org | no-pdf-link | 4075
+
+https://doi.org/10.1515/9781501747045-013
+https://apps.crossref.org/coaccess/coaccess.html?doi=10.1515%2F9781501747045-013
+
+Derp, they are doing a dynamic/AJAX thing, so access links are not in the HTML.
+
+## openedition
+
+ books.openedition.org | | 1784
+ books.openedition.org | no-pdf-link | 1466
+
+https://doi.org/10.4000/books.pul.34492
+https://books.openedition.org/pul/34492
+
+these are not actually OA books (or at least, not all are)
+
+## chemrxiv.org (figshare)
+
+ chemrxiv.org | | 857
+ chemrxiv.org | no-pdf-link | 519
+
+https://doi.org/10.26434/chemrxiv.14411081
+https://chemrxiv.org/articles/preprint/Prediction_and_Optimization_of_Ion_Transport_Characteristics_in_Nanoparticle-Based_Electrolytes_Using_Convolutional_Neural_Networks/14411081
+
+these all seem to be *multi-file* entities, thus not good for single file ingest pipeline.
+
+## direct.mit.edu
+
+ direct.mit.edu | | 996
+ direct.mit.edu | no-pdf-link | 869
+
+https://doi.org/10.7551/mitpress/14056.003.0004
+https://direct.mit.edu/books/monograph/5111/chapter-abstract/3060134/Adding-Technology-to-Contact-Tracing?redirectedFrom=fulltext
+
+"not available"
+
+https://doi.org/10.7551/mitpress/12444.003.0004
+
+"not available"
+
+
+## dlc.library.columbia.edu
+
+ dlc.library.columbia.edu | | 4225
+ dlc.library.columbia.edu | no-pdf-link | 2395
+ dlc.library.columbia.edu | spn2-wayback-error | 1568
+
+https://doi.org/10.7916/d8-506w-kk49
+https://dlc.library.columbia.edu/durst/cul:18931zcrk9
+
+document repository.
+this one goes to IA! actually many seem to.
+added extractor, should re-ingest with:
+
+ publisher:"Columbia University" doi_prefix:10.7916 !journal:*
+
+actually, that is like 600k+ results and many are not digitized, so perhaps not.
+
+## doi.ala.org.au
+
+ doi.ala.org.au | | 2570
+ doi.ala.org.au | no-pdf-link | 2153
+
+https://doi.org/10.26197/ala.811d55e3-2ff4-4501-b3e7-e19249507052
+https://doi.ala.org.au/doi/811d55e3-2ff4-4501-b3e7-e19249507052
+
+this is a data repository, with filesets, not papers. datacite metadata is
+incorrect.
+
+## fldeploc.dep.state.fl.us
+
+ fldeploc.dep.state.fl.us | | 774
+ fldeploc.dep.state.fl.us | no-pdf-link | 718
+
+
+https://doi.org/10.35256/ic29
+http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29
+
+re-ingest with:
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+## geoscan.nrcan.gc.ca
+
+ geoscan.nrcan.gc.ca | | 2056
+ geoscan.nrcan.gc.ca | no-pdf-link | 2019
+
+https://doi.org/10.4095/295366
+https://geoscan.nrcan.gc.ca/starweb/geoscan/servlet.starweb?path=geoscan/fulle.web&search1=R=295366
+
+this is a geographic repository, not papers.
+
+## kiss.kstudy.com
+
+ kiss.kstudy.com | | 747
+ kiss.kstudy.com | no-pdf-link | 686
+
+https://doi.org/10.22143/hss21.12.1.121
+http://kiss.kstudy.com/thesis/thesis-view.asp?key=3862523
+
+Korean. seems to not actually be theses? can't download.
+
+## linkinghub.elsevier.com
+
+ linkinghub.elsevier.com | | 5079
+ linkinghub.elsevier.com | forbidden | 2226
+ linkinghub.elsevier.com | spn2-wayback-error | 1625
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758
+
+skipping for now, looks like mostly 'forbidden'?
+
+## osf.io
+
+These are important!
+
+ osf.io | | 3139
+ osf.io | not-found | 2288
+ osf.io | spn2-wayback-error | 582
+
+https://doi.org/10.31219/osf.io/jux3w
+https://accounts.osf.io/login?service=https://osf.io/jux3w/download
+
+many of these are 404s by browser as well. what does that mean?
+
+## peerj.com
+
+ peerj.com | | 785
+ peerj.com | no-pdf-link | 552
+
+https://doi.org/10.7287/peerj.11155v0.1/reviews/2
+https://peerj.com/articles/11155/reviews/
+
+these are HTML reviews, not papers
+
+## preprints.jmir.org
+
+ preprints.jmir.org | | 763
+ preprints.jmir.org | no-pdf-link | 611
+
+https://doi.org/10.2196/preprints.22556
+https://preprints.jmir.org/preprint/22556
+
+UGH, looks simple, but javascript.
+
+could try to re-write URL into S3 format? meh.
+
+## psyarxiv.com (OSF?)
+
+ psyarxiv.com | | 641
+ psyarxiv.com | no-pdf-link | 546
+
+https://doi.org/10.31234/osf.io/5jaqg
+https://psyarxiv.com/5jaqg/
+
+Also infuriatingly Javascript, but can do URL hack.
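+
+A sketch of that URL hack (an assumption, mirroring the `osf.io/.../download`
+pattern from the osf.io example earlier in these notes):
+
+    # hypothetical rewrite of the psyarxiv landing URL to a direct OSF download URL
+    echo "https://psyarxiv.com/5jaqg/" \
+        | sed -E 's|https://psyarxiv.com/([a-z0-9]+)/?|https://osf.io/\1/download|'
+    # => https://osf.io/5jaqg/download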
+
+Should reingest, and potentially force-recrawl:
+
+ # about 67k
+ publisher:"Center for Open Science" in_ia:false
+
+## publons.com
+
+ publons.com | | 6998
+ publons.com | no-pdf-link | 6982
+
+https://doi.org/10.1002/jmor.21338/v2/review1
+https://publons.com/publon/40260824/
+
+These are just HTML reviews, not papers.
+
+## saemobilus.sae.org
+
+ saemobilus.sae.org | | 795
+ saemobilus.sae.org | no-pdf-link | 669
+
+https://doi.org/10.4271/as1426c
+https://saemobilus.sae.org/content/as1426c
+
+These seem to be standards, and are not open access (paywall)
+
+## scholar.dkyobobook.co.kr
+
+ scholar.dkyobobook.co.kr | | 1043
+ scholar.dkyobobook.co.kr | no-pdf-link | 915
+
+https://doi.org/10.22471/crisis.2021.6.1.18
+http://scholar.dkyobobook.co.kr/searchDetail.laf?barcode=4010028199536
+
+Korean. complex javascript, skipping.
+
+## unreserved.rba.gov.au
+
+ unreserved.rba.gov.au | | 823
+ unreserved.rba.gov.au | no-pdf-link | 821
+
+https://doi.org/10.47688/rba_archives_2006/04129
+https://unreserved.rba.gov.au/users/login
+
+Didn't seem to need a login when tried in a browser? Document repo, not papers.
+
+## wayf.switch.ch
+
+ wayf.switch.ch | | 1169
+ wayf.switch.ch | no-pdf-link | 809
+
+https://doi.org/10.24451/arbor.11128
+https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Farbor.bfh.ch%2Fshibboleth&return=https%3A%2F%2Farbor.bfh.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A5056fc0a97aeab16e5007ca63bede254cb5669d94173064d6c74c62a0f88b022
+
+Loginwall
+
+## www.bloomsburycollections.com
+
+ www.bloomsburycollections.com | | 1745
+ www.bloomsburycollections.com | no-pdf-link | 1571
+
+https://doi.org/10.5040/9781849664264.0008
+https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries
+
+These are primarily not OA/available.
+
+## www.emc2020.eu
+
+ www.emc2020.eu | | 791
+ www.emc2020.eu | no-pdf-link | 748
+
+https://doi.org/10.22443/rms.emc2020.146
+https://www.emc2020.eu/abstract/evaluation-of-different-rectangular-scan-strategies-for-hrstem-imaging.html
+
+These are just abstracts, not papers.
+
+## Emerald
+
+ www.emerald.com | | 2420
+ www.emerald.com | no-pdf-link | 1986
+
+https://doi.org/10.1108/ramj-11-2020-0065
+https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html
+
+Note that these URLs are already HTML fulltext, but the PDF is also available and easy to get.
+
+re-ingest:
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+## www.humankineticslibrary.com
+
+ www.humankineticslibrary.com | | 1122
+ www.humankineticslibrary.com | no-pdf-link | 985
+
+https://doi.org/10.5040/9781718206625.ch-002
+https://www.humankineticslibrary.com/encyclopedia-chapter?docid=b-9781718206625&tocid=b-9781718206625-chapter2
+
+paywall
+
+## www.inderscience.com
+
+ www.inderscience.com | | 1532
+ www.inderscience.com | no-pdf-link | 1217
+
+https://doi.org/10.1504/ijdmb.2020.10036342
+https://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijdmb
+
+paywall
+
+## www.ingentaconnect.com
+
+ www.ingentaconnect.com | | 885
+ www.ingentaconnect.com | no-pdf-link | 783
+
+https://doi.org/10.15258/sst.2021.49.1.07
+https://www.ingentaconnect.com/content/ista/sst/pre-prints/content-7_sst.2021.49.1_63-71;jsessionid=1joc5mmi1juht.x-ic-live-02
+
+Annoying javascript, but easy to work around.
+
+re-ingest:
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+## www.nomos-elibrary.de
+
+ www.nomos-elibrary.de | | 2235
+ www.nomos-elibrary.de | no-pdf-link | 1128
+ www.nomos-elibrary.de | spn2-wayback-error | 559
+
+https://doi.org/10.5771/9783748907084-439
+https://www.nomos-elibrary.de/10.5771/9783748907084-439/verzeichnis-der-autorinnen-und-autoren
+
+Javascript obfuscated download button?
+
+## www.oecd-ilibrary.org
+
+ www.oecd-ilibrary.org | | 3046
+ www.oecd-ilibrary.org | no-pdf-link | 2869
+
+https://doi.org/10.1787/543e84ed-en
+https://www.oecd-ilibrary.org/development/applying-evaluation-criteria-thoughtfully_543e84ed-en
+
+Paywall.
+
+## www.osapublishing.org
+
+ www.osapublishing.org | | 821
+ www.osapublishing.org | no-pdf-link | 615
+
+https://doi.org/10.1364/boe.422199
+https://www.osapublishing.org/boe/abstract.cfm?doi=10.1364/BOE.422199
+
+Some of these are "pre-registered" DOIs, not published yet. Many of the
+remaining are actually HTML articles, and/or have some stuff in the
+`citation_pdf_url`. A core problem is captchas.
+
+Have started adding support to fatcat for HTML crawl type based on container.
+
+re-ingest:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+## www.oxfordscholarlyeditions.com
+
+ www.oxfordscholarlyeditions.com | | 759
+ www.oxfordscholarlyeditions.com | no-pdf-link | 719
+
+https://doi.org/10.1093/oseo/instance.00266789
+https://www.oxfordscholarlyeditions.com/view/10.1093/actrade/9780199593668.book.1/actrade-9780199593668-div1-27
+
+loginwall/paywall
+
+## www.schweizerbart.de
+
+ www.schweizerbart.de | | 730
+ www.schweizerbart.de | no-pdf-link | 653
+
+https://doi.org/10.1127/zfg/40/1996/461
+https://www.schweizerbart.de/papers/zfg/detail/40/97757/Theoretical_model_of_surface_karstic_processes?af=crossref
+
+paywall
+
+## www.sciencedirect.com
+
+ www.sciencedirect.com | | 14757
+ www.sciencedirect.com | no-pdf-link | 12733
+ www.sciencedirect.com | spn2-wayback-error | 1503
+
+https://doi.org/10.1016/j.landurbplan.2021.104104
+https://www.sciencedirect.com/science/article/pii/S0169204621000670
+
+Bunch of crazy new hacks, but seems to be working!
+
+re-ingest:
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2021
+
+## www.sciendo.com
+
+ www.sciendo.com | | 1955
+ www.sciendo.com | no-pdf-link | 1176
+
+https://doi.org/10.2478/awutm-2019-0012
+https://www.sciendo.com/article/10.2478/awutm-2019-0012
+
+uses lots of javascript, hard to scrape.
+
+
+## Others (for reference)
+
+ | | 725990
+ | no-pdf-link | 209933
+ | success | 206134
+ | spn2-wayback-error | 127015
+ | spn2-cdx-lookup-failure | 53384
+ | blocked-cookie | 35867
+ | link-loop | 25834
+ | too-many-redirects | 16430
+ | redirect-loop | 14648
+ | forbidden | 13794
+ | terminal-bad-status | 8055
+ | not-found | 6399
+ | remote-server-error | 2402
+ | wrong-mimetype | 2011
+ | spn2-error:unauthorized | 912
+ | bad-redirect | 555
+ | read-timeout | 530
+
+## Re-ingests
+
+All the above combined:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id twtpsm6ytje3nhuqfu3pa7ca7u
+ => Counter({'ingest_request': 1142, 'elasticsearch_release': 1142, 'estimate': 1142, 'kafka': 1142})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id cg4vcsfty5dfvgmat5wm62wgie
+ => Counter({'elasticsearch_release': 33482, 'estimate': 33482, 'ingest_request': 32864, 'kafka': 32864})
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query "doi_prefix:10.35256 publisher:Florida"
+ => Counter({'ingest_request': 843, 'elasticsearch_release': 843, 'estimate': 843, 'kafka': 843})
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1108 publisher:emerald"
+ => Counter({'ingest_request': 3812, 'elasticsearch_release': 3812, 'estimate': 3812, 'kafka': 3812})
+
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl query "doi_prefix:10.15258 year:>2018"
+ => Counter({'ingest_request': 140, 'elasticsearch_release': 140, 'estimate': 140, 'kafka': 140})
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2020
+ doi_prefix:10.1016 is_oa:true year:2021
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2020"
+ => Counter({'ingest_request': 75936, 'elasticsearch_release': 75936, 'estimate': 75936, 'kafka': 75936})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2021"
+ => Counter({'ingest_request': 54824, 'elasticsearch_release': 54824, 'estimate': 54824, 'kafka': 54824})
+
+ pmcid:* year:2018
+ pmcid:* year:2019
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2018"
+ => Counter({'ingest_request': 25366, 'elasticsearch_release': 25366, 'estimate': 25366, 'kafka': 25366})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2019"
+ => Counter({'ingest_request': 55658, 'elasticsearch_release': 55658, 'estimate': 55658, 'kafka': 55658})
+
diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md
new file mode 100644
index 0000000..8b6ac09
--- /dev/null
+++ b/notes/ingest/2021-07_unpaywall.md
@@ -0,0 +1,320 @@
+
+New snapshot released 2021-07-02. Should be "boring" ingest and crawl.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02T151134.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json
+ => 32.2M 3:01:52 [2.95k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 32196260, 'insert-requests': 3325954, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 32196260, 'pushed': 32196260})
+
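+For reference, each line emitted by `unpaywall2ingestrequest.py` is a JSON
+ingest request. A rough sketch of the shape, using only field names that appear
+in the SQL queries in these notes (values invented; any other fields omitted):
+
+    {"ingest_type": "pdf", "base_url": "https://example.com/fulltext.pdf", "link_source": "unpaywall", "link_source_id": "10.1234/example", "release_stage": "published"}
+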
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+ => COPY 3556146
+
+ # previous, 2020-10 run: COPY 4216339
+ # previous, 2021-07 run: COPY 3277484
+
+Oops, should have run this instead, with the date filter:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+
+But didn't, so all rows were processed instead.
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json
+ => 3.56M 0:01:59 [29.8k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done, on 2021-07-13
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 1831827
+ success | 1343604
+ redirect-loop | 103999
+ terminal-bad-status | 19845
+ no-pdf-link | 17448
+ link-loop | 5027
+ wrong-mimetype | 2270
+ cdx-error | 523
+ body-too-large | 321
+ null-body | 298
+ wayback-content-error | 242
+ petabox-error | 155
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ wayback-error | 109
+ blocked-cookie | 9
+ timeout | 7
+ | 3
+ bad-redirect | 3
+ spn2-cdx-lookup-failure | 3
+ (20 rows)
+
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json';
+ => COPY 1743186
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json
+ => 1.74M 0:01:33 [18.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.*.txt
+ 1 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+ 1643963 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ 1644028 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ 3287992 total
+
+Then run crawl (see `journal-crawls` git repo).
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1.74M 0:01:59 [14.6k/s]
+
+## Post-Ingest Stats
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2690258
+ redirect-loop | 227328
+ no-capture | 157368
+ terminal-bad-status | 118943
+ no-pdf-link | 92698
+ blocked-cookie | 19478
+ link-loop | 9249
+ wrong-mimetype | 4918
+ cdx-error | 1786
+ wayback-error | 1497
+ null-body | 1302
+ body-too-large | 433
+ wayback-content-error | 245
+ petabox-error | 171
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ timeout | 12
+ bad-redirect | 4
+ | 3
+ spn2-cdx-lookup-failure | 1
+ (20 rows)
+
+Only the recent updates, by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------+---------
+ accepted | success | 103144
+ accepted | no-pdf-link | 53981
+ accepted | terminal-bad-status | 4102
+ accepted | link-loop | 2799
+ accepted | no-capture | 2315
+ accepted | redirect-loop | 2171
+ accepted | blocked-cookie | 234
+ accepted | cdx-error | 140
+ accepted | wayback-error | 101
+ accepted | wrong-mimetype | 38
+ accepted | null-body | 10
+ accepted | petabox-error | 5
+ accepted | wayback-content-error | 4
+ accepted | gateway-timeout | 2
+ accepted | body-too-large | 2
+ published | success | 1919100
+ published | no-capture | 130104
+ published | redirect-loop | 127482
+ published | terminal-bad-status | 43118
+ published | no-pdf-link | 33505
+ published | blocked-cookie | 19034
+ published | link-loop | 6241
+ published | wrong-mimetype | 4163
+ published | null-body | 1195
+ published | cdx-error | 1151
+ published | wayback-error | 1105
+ published | wayback-content-error | 197
+ published | body-too-large | 195
+ published | petabox-error | 118
+ published | gateway-timeout | 35
+ published | invalid-host-resolution | 13
+ published | timeout | 8
+ published | bad-redirect | 2
+ published | spn2-cdx-lookup-failure | 1
+ published | bad-gzip-encoding | 1
+ submitted | success | 668014
+ submitted | redirect-loop | 97675
+ submitted | terminal-bad-status | 71723
+ submitted | no-capture | 24949
+ submitted | no-pdf-link | 5212
+ submitted | wrong-mimetype | 717
+ submitted | cdx-error | 495
+ submitted | wayback-error | 291
+ submitted | body-too-large | 236
+ submitted | blocked-cookie | 210
+ submitted | link-loop | 209
+ submitted | invalid-host-resolution | 107
+ submitted | gateway-timeout | 101
+ submitted | null-body | 97
+ submitted | petabox-error | 48
+ submitted | wayback-content-error | 44
+ submitted | timeout | 4
+ submitted | | 3
+ submitted | bad-redirect | 2
+ submitted | remote-server-error | 1
+ (55 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 3,325,954 raw ingest requests (new URLs total)
+- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), and were attempted in the crawl
+- 1,346,654 (77% of crawled) new successes from heritrix crawling (post-crawl success 2,690,258 minus pre-crawl success 1,343,604)
+- 2,690,258 (80%) total success (including URLs crawled earlier for other reasons; percentage is out of all new URLs, including those not expected to succeed)
+
+## Live Ingest Follow-Up
+
+Will run SPN requests on the ~160k `no-capture` URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json';
+ => COPY 157371
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json
+ => 157k 0:00:04 [31.6k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ => DONE
diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md
new file mode 100644
index 0000000..5f92196
--- /dev/null
+++ b/notes/ingest/2021-08_mag.md
@@ -0,0 +1,400 @@
+
+Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest.
+Also want to re-ingest some old/failed ingests, now that pipeline/code has
+improved.
+
+Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs.
+
+
+## Persist Ingest Requests
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000})
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 22.5M 0:46:00 [8.16k/s]
+ => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585})
+
+Roughly 8.6 million new URLs
+
+## Pre-Crawl Status Counts
+
+Status of combined old and new requests, with some large domains removed:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ success | 26123975
+ | 6664846
+ no-pdf-link | 1859908
+ redirect-loop | 1532405
+ no-capture | 1199126
+ link-loop | 1157010
+ terminal-bad-status | 832362
+ gateway-timeout | 202158
+ spn2-cdx-lookup-failure | 81406
+ wrong-mimetype | 69087
+ invalid-host-resolution | 37262
+ wayback-error | 21340
+ petabox-error | 11237
+ null-body | 9414
+ wayback-content-error | 2199
+ cdx-error | 1893
+ spn2-error | 1741
+ spn2-error:job-failed | 971
+ blocked-cookie | 902
+ spn2-error:invalid-url-syntax | 336
+ (20 rows)
+
+And just the new URLs (note that domain filter shouldn't be required, but
+keeping for consistency):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ | 6664780
+ success | 1957844
+ redirect-loop | 23357
+ terminal-bad-status | 9385
+ no-pdf-link | 8315
+ no-capture | 6892
+ link-loop | 4517
+ wrong-mimetype | 3864
+ cdx-error | 1749
+ blocked-cookie | 842
+ null-body | 747
+ wayback-error | 688
+ wayback-content-error | 570
+ gateway-timeout | 367
+ petabox-error | 340
+ spn2-cdx-lookup-failure | 150
+ read-timeout | 122
+ not-found | 119
+ invalid-host-resolution | 63
+ spn2-error | 23
+ (20 rows)
+
+## Dump Initial Bulk Ingest Requests
+
+Note that this is all-time, not just recent, and will re-process a lot of
+"no-pdf-link":
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-pdf-link'
+ OR ingest_file_result.status = 'cdx-error'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json';
+ => COPY 8526647
+
+Transform to ingest requests:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json
+ => 8.53M 0:03:40
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+Updated stats after running initial bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 5184994
+ no-capture | 3284416
+ redirect-loop | 98685
+ terminal-bad-status | 28733
+ link-loop | 28518
+ blocked-cookie | 22338
+ no-pdf-link | 19073
+ wrong-mimetype | 9122
+ null-body | 2793
+ wayback-error | 2128
+ wayback-content-error | 1233
+ cdx-error | 1198
+ petabox-error | 617
+ gateway-timeout | 395
+ not-found | 130
+ read-timeout | 128
+ | 111
+ invalid-host-resolution | 63
+ spn2-cdx-lookup-failure | 24
+ spn2-error | 20
+ (20 rows)
+
+## Generate Seedlist
+
+For crawling, do a similar (but not identical) dump:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json';
+ => COPY 4599519
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json
+ => 4.60M 0:02:55 [26.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+ cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ => DONE
+
+ wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt
+ 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+
+## Post-Crawl Bulk Re-Ingest
+
+Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by
+hash, URL agnostic).
+
+Enqueue for bulk re-ingest:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => Thu 19 Aug 2021 09:10:59 PM UTC
+
+
+## Post-Ingest Stats
+
+Just the new stuff (compare against above for delta):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 7748241 89.2%
+ no-capture | 429688 4.9%
+ redirect-loop | 172831 2.0%
+ terminal-bad-status | 94029 1.1%
+ no-pdf-link | 86437 1.0%
+ blocked-cookie | 67903 0.8%
+ link-loop | 50622
+ wrong-mimetype | 21064
+ null-body | 6650
+ cdx-error | 3313
+ wayback-error | 2630
+ gateway-timeout | 399
+ petabox-error | 268
+ wayback-content-error | 170
+ not-found | 130
+ read-timeout | 128
+ | 109
+ invalid-host-resolution | 63
+ bad-redirect | 39
+ spn2-error | 20
+ (20 rows)
+
+New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397
+
+Overall success rate of new batch: 7748241 / 8686315 = 89.2%
+
+And combined (old and new) status again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 31990062
+ redirect-loop | 1704717
+ no-capture | 1263462
+ link-loop | 1218280
+ blocked-cookie | 1213838
+ no-pdf-link | 1096664
+ terminal-bad-status | 960070
+ gateway-timeout | 202190
+ wrong-mimetype | 86557
+ invalid-host-resolution | 37262
+ null-body | 15443
+ wayback-error | 12839
+ cdx-error | 4047
+ spn2-error | 1731
+ spn2-error:job-failed | 962
+ petabox-error | 463
+ wayback-content-error | 379
+ spn2-error:invalid-url-syntax | 336
+ spn2-error:soft-time-limit-exceeded | 203
+ | 175
+ (20 rows)
+
+New success total: 31990062 - 26123975 = 5,866,087
+
+A full 1,263,462 'no-capture' requests could still be attempted... though many
+of those may be excluded for a specific reason.
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md
new file mode 100644
index 0000000..ac808dd
--- /dev/null
+++ b/notes/ingest/2021-09-02_oai_pmh_patch.md
@@ -0,0 +1,1578 @@
+
+Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially
+re-crawling content which failed to ingest the first time.
+
+May fold this in with more general patch crawling.
+
+## Basic Counts
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 14145387
+ no-pdf-link | 12063022
+ no-capture | 5485640
+ redirect-loop | 2092705
+ terminal-bad-status | 747372
+ wrong-mimetype | 597219
+ link-loop | 542144
+ null-body | 93566
+ cdx-error | 19798
+ petabox-error | 17943
+ | 15283
+ wayback-error | 13897
+ gateway-timeout | 511
+ skip-url-blocklist | 184
+ wayback-content-error | 146
+ bad-redirect | 137
+ redirects-exceeded | 120
+ bad-gzip-encoding | 116
+ timeout | 80
+ blocked-cookie | 64
+ (20 rows)
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 40;
+
+
+ oai_prefix | success | total
+ ---------------------------+---------+---------
+ repec | 1133175 | 2783448
+ hal | 573218 | 1049607
+ www.irgrid.ac.cn | 18007 | 748828
+ cds.cern.ch | 74078 | 688091
+ americanae.aecid.es | 71310 | 572792
+ juser.fz-juelich.de | 23026 | 518551
+ espace.library.uq.edu.au | 6649 | 508960
+ igi.indrastra.com | 59629 | 478577
+ archive.ugent.be | 65306 | 424014
+ hrcak.srce.hr | 404085 | 414897
+ zir.nsk.hr | 156753 | 397200
+ renati.sunedu.gob.pe | 79362 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7997 | 354529
+ generic.eprints.org | 263566 | 340470
+ invenio.nusl.cz | 6340 | 325867
+ evastar-karlsruhe.de | 62282 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ diva.org | 67917 | 298348
+ t2r2.star.titech.ac.jp | 1085 | 289388
+ edpsciences.org | 139495 | 284972
+ repository.ust.hk | 10245 | 283417
+ revues.org | 151156 | 277497
+ pure.atira.dk | 13492 | 260754
+ bibliotecadigital.jcyl.es | 50606 | 254134
+ escholarship.org/ark | 140835 | 245203
+ ojs.pkp.sfu.ca | 168029 | 229387
+ lup.lub.lu.se | 49358 | 226602
+ library.wur.nl | 15051 | 216738
+ digitalrepository.unm.edu | 111704 | 211749
+ infoscience.tind.io | 60166 | 207299
+ edoc.mpg.de | 0 | 205252
+ erudit.org | 168490 | 197803
+ delibra.bg.polsl.pl | 38666 | 196652
+ n/a | 0 | 193814
+ aleph.bib-bvb.de | 4349 | 186666
+ serval.unil.ch | 41643 | 186372
+ orbi.ulg.ac.be | 2400 | 184551
+ digitalcommons.unl.edu | 144025 | 184372
+ bib-pubdb1.desy.de | 33525 | 182717
+ (40 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 50;
+
+ oai_prefix | status | count
+ ---------------------------+---------------+---------
+ repec | success | 1133175
+ repec | no-pdf-link | 638105
+ hal | success | 573218
+ cds.cern.ch | no-capture | 540380
+ repec | redirect-loop | 516451
+ juser.fz-juelich.de | no-pdf-link | 477881
+ americanae.aecid.es | no-pdf-link | 417766
+ hrcak.srce.hr | success | 404085
+ www.irgrid.ac.cn | no-pdf-link | 370908
+ hal | no-pdf-link | 359252
+ www.irgrid.ac.cn | no-capture | 355532
+ espace.library.uq.edu.au | no-pdf-link | 320479
+ igi.indrastra.com | no-pdf-link | 318242
+ repec | no-capture | 316981
+ invenio.nusl.cz | no-pdf-link | 309802
+ rour.neicon.ru | redirect-loop | 300911
+ hypotheses.org | no-pdf-link | 300251
+ renati.sunedu.gob.pe | no-capture | 282800
+ t2r2.star.titech.ac.jp | no-pdf-link | 272045
+ generic.eprints.org | success | 263566
+ quod.lib.umich.edu | no-pdf-link | 259661
+ archive.ugent.be | no-capture | 256127
+ evastar-karlsruhe.de | no-pdf-link | 248939
+ zir.nsk.hr | link-loop | 226919
+ repository.ust.hk | no-pdf-link | 208569
+ edoc.mpg.de | no-pdf-link | 199758
+ bibliotecadigital.jcyl.es | no-pdf-link | 188433
+ orbi.ulg.ac.be | no-pdf-link | 172373
+ diva.org | no-capture | 171115
+ lup.lub.lu.se | no-pdf-link | 168652
+ erudit.org | success | 168490
+ ojs.pkp.sfu.ca | success | 168029
+ lib.dr.iastate.edu | success | 158494
+ zir.nsk.hr | success | 156753
+ digital.kenyon.edu | success | 154900
+ revues.org | success | 151156
+ books.openedition.org | no-pdf-link | 149607
+ freidok.uni-freiburg.de | no-pdf-link | 146837
+ digitalcommons.unl.edu | success | 144025
+ escholarship.org/ark | success | 140835
+ culeuclid | link-loop | 140291
+ edpsciences.org | success | 139495
+ serval.unil.ch | no-pdf-link | 138644
+ bib-pubdb1.desy.de | no-pdf-link | 133815
+ krm.or.kr | no-pdf-link | 132461
+ pure.atira.dk | no-pdf-link | 132179
+ oai-gms.dimdi.de | redirect-loop | 131409
+ aleph.bib-bvb.de | no-capture | 128261
+ library.wur.nl | no-pdf-link | 124718
+ lirias2repo.kuleuven.be | no-capture | 123106
+ (50 rows)
+
+Note: could just delete these "excluded" rows entirely, stop harvesting them in
+the future, and filter them out at ingest time (in the transform script).
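+
+If we did go the deletion route, it would be something along these lines (a
+sketch only, not run; same exclusion list as the queries above):
+
+    DELETE FROM ingest_request
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'oai'
+        AND (
+            ingest_request.link_source_id LIKE 'oai:kb.dk:%'
+            OR ingest_request.link_source_id LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+            OR ingest_request.link_source_id LIKE 'oai:hispana.mcu.es:%'
+            OR ingest_request.link_source_id LIKE 'oai:bnf.fr:%'
+            OR ingest_request.link_source_id LIKE 'oai:ukm.si:%'
+            OR ingest_request.link_source_id LIKE 'oai:biodiversitylibrary.org:%'
+            OR ingest_request.link_source_id LIKE 'oai:hsp.org:%'
+        );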
+
+
+
+## Investigate no-pdf-link sandcrawler improvements
+
+Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works:
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%'
+ ORDER BY random()
+ LIMIT 10;
+
+Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works):
+
+ \x auto
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ ORDER BY random()
+ LIMIT 30;
+
+### repec (SKIP-PREFIX)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35
+base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html
+terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647
+base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf
+terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75
+base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec
+terminal_url | https://www.jstor.org/stable/1884373
+
+Huh! This is just a catalog pointing out to other domains. Should probably skip.
+
+DONE: skip/filter repec
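+
+To size the impact before adding the filter, something like the following (a sketch, reusing the same join and filters as the sampling queries above) would count how many 'no-pdf-link' rows the repec prefix accounts for:
+
+    SELECT COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'oai'
+        AND ingest_request.link_source_id LIKE 'oai:repec:%'
+        AND ingest_file_result.status = 'no-pdf-link';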
+
+### juser.fz-juelich.de (SCOPE)
+
+-[ RECORD 1 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:132217
+base_url | http://juser.fz-juelich.de/record/132217
+terminal_url | http://juser.fz-juelich.de/record/132217
+
+Poster; no files.
+
+-[ RECORD 2 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:268598
+base_url | http://juser.fz-juelich.de/record/268598
+terminal_url | http://juser.fz-juelich.de/record/268598
+
+Journal.
+
+-[ RECORD 3 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:126613
+base_url | http://juser.fz-juelich.de/record/126613
+terminal_url | http://juser.fz-juelich.de/record/126613
+
+-[ RECORD 4 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:67362
+base_url | http://juser.fz-juelich.de/record/67362
+terminal_url | http://juser.fz-juelich.de/record/67362
+-[ RECORD 5 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:869189
+base_url | http://juser.fz-juelich.de/record/869189
+terminal_url | http://juser.fz-juelich.de/record/869189
+-[ RECORD 6 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:810746
+base_url | http://juser.fz-juelich.de/record/810746
+terminal_url | http://juser.fz-juelich.de/record/810746
+-[ RECORD 7 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:52897
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+-[ RECORD 8 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:114755
+base_url | http://juser.fz-juelich.de/record/114755
+terminal_url | http://juser.fz-juelich.de/record/114755
+-[ RECORD 9 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:58025
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+
+The search URLs seem redundant? Not going to try to handle those.
+
+"Powered by Invenio v1.1.7"
+
+None of these examples seem to be papers. Maybe we can filter these better at
+the harvest or transform stage?
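+
+A rough way to check how many of these requests are the redundant "search" URLs versus regular "/record/" URLs (a sketch against the same request table as above):
+
+    SELECT
+        ingest_request.base_url LIKE '%/search?p=id:%' AS is_search_url,
+        COUNT(*)
+    FROM ingest_request
+    WHERE
+        ingest_request.link_source = 'oai'
+        AND ingest_request.link_source_id LIKE 'oai:juser.fz-juelich.de:%'
+    GROUP BY 1;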
+
+### americanae.aecid.es (MIXED)
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:502896
+base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+
+Just a metadata record? Links to Redalyc.
+
+METADATA-ONLY
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:534600
+base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:524567
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+
+NOT-FOUND (404)
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:378914
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+
+Some single-page image archival thing? bespoke, skipping.
+
+SKIP-BESPOKE
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:526142
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+
+NOT-FOUND (404)
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:373408
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+
+NOT-FOUND (404)
+
+### www.irgrid.ac.cn (SKIP-PREFIX)
+
+Chinese Academy of Sciences Institutional Repositories Grid
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1749980
+base_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+
+Can't access
+
+FORBIDDEN
+
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/857397
+base_url | http://www.irgrid.ac.cn/handle/1471x/857397
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397
+
+Just linking to another IR; skip it.
+
+http://ir.ipe.ac.cn/handle/122111/10608
+
+requires login
+
+DONE: '/password-login;jsessionid' as a loginwall URL pattern
+ http://ir.ipe.ac.cn/handle/122111/10608
+ http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf
+
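+To estimate how many existing results the new '/password-login;jsessionid' loginwall pattern would re-classify, a breakdown by current status (a sketch, over the same result table as the queries above):
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_file_result
+    WHERE ingest_file_result.terminal_url LIKE '%/password-login;jsessionid%'
+    GROUP BY ingest_file_result.status
+    ORDER BY COUNT(*) DESC;
+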
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1060447
+base_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1671377
+base_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1178430
+base_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2488017
+base_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/977147
+base_url | http://www.irgrid.ac.cn/handle/1471x/977147
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2454503
+base_url | http://ir.nwipb.ac.cn/handle/363003/9957
+terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957
+
+This domain is a disappointment :(
+
+Should continue crawling, as the metadata is open and good, but we won't get fulltext?
+
+### hal (FIXED-PARTIAL)
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00744951v1
+base_url | https://hal.archives-ouvertes.fr/hal-00744951
+terminal_url | https://hal.archives-ouvertes.fr/hal-00744951
+
+Off-site OA link.
+
+FIXED-HAL
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-01065398v1
+base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf
+terminal_url | https://hal.archives-ouvertes.fr/index/index
+-[ RECORD 3 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:lirmm-00371599v1
+base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+
+To Elsevier :(
+
+-[ RECORD 4 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00284780v1
+base_url | https://hal.archives-ouvertes.fr/hal-00284780
+terminal_url | https://hal.archives-ouvertes.fr/hal-00284780
+
+METADATA-ONLY
+
+-[ RECORD 5 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00186151v1
+base_url | https://hal.archives-ouvertes.fr/hal-00186151
+terminal_url | https://hal.archives-ouvertes.fr/hal-00186151
+
+METADATA-ONLY
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00399754v1
+base_url | https://hal.archives-ouvertes.fr/hal-00399754
+terminal_url | https://hal.archives-ouvertes.fr/hal-00399754
+
+METADATA-ONLY
+
+
+### espace.library.uq.edu.au (SKIP)
+
+-[ RECORD 1 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:136497
+base_url | https://espace.library.uq.edu.au/view/UQ:136497
+terminal_url | https://espace.library.uq.edu.au/view/UQ:136497
+-[ RECORD 2 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:411389
+base_url | https://espace.library.uq.edu.au/view/UQ:411389
+terminal_url | https://espace.library.uq.edu.au/view/UQ:411389
+-[ RECORD 3 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:401773
+base_url | https://espace.library.uq.edu.au/view/UQ:401773
+terminal_url | https://espace.library.uq.edu.au/view/UQ:401773
+-[ RECORD 4 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:675334
+base_url | https://espace.library.uq.edu.au/view/UQ:675334
+terminal_url | https://espace.library.uq.edu.au/view/UQ:675334
+-[ RECORD 5 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:312311
+base_url | https://espace.library.uq.edu.au/view/UQ:312311
+terminal_url | https://espace.library.uq.edu.au/view/UQ:312311
+-[ RECORD 6 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:209401
+base_url | https://espace.library.uq.edu.au/view/UQ:209401
+terminal_url | https://espace.library.uq.edu.au/view/UQ:209401
+-[ RECORD 7 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:327188
+base_url | https://espace.library.uq.edu.au/view/UQ:327188
+terminal_url | https://espace.library.uq.edu.au/view/UQ:327188
+
+Very JavaScript-heavy (skeletal HTML), and just links to fulltext on publisher
+sites.
+
+### igi.indrastra.com (METADATA-ONLY)
+
+-[ RECORD 1 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:267221
+base_url | http://igi.indrastra.com/items/show/267221
+terminal_url | http://igi.indrastra.com/items/show/267221
+-[ RECORD 2 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:181799
+base_url | http://igi.indrastra.com/items/show/181799
+terminal_url | http://igi.indrastra.com/items/show/181799
+-[ RECORD 3 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:125382
+base_url | http://igi.indrastra.com/items/show/125382
+terminal_url | http://igi.indrastra.com/items/show/125382
+-[ RECORD 4 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:47266
+base_url | http://igi.indrastra.com/items/show/47266
+terminal_url | http://igi.indrastra.com/items/show/47266
+-[ RECORD 5 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:12872
+base_url | http://igi.indrastra.com/items/show/12872
+terminal_url | http://igi.indrastra.com/items/show/12872
+-[ RECORD 6 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:231620
+base_url | http://igi.indrastra.com/items/show/231620
+terminal_url | http://igi.indrastra.com/items/show/231620
+
+"Proudly powered by Omeka"
+
+### invenio.nusl.cz (METADATA-ONLY)
+
+ oai_id | base_url | terminal_url
+----------------------------+------------------------------------+--------------------------------------
+ oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409
+ oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783
+ oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961
+ oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | http://invenio.nusl.cz/record/318800
+ oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695
+ oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393
+ oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987
+ oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396
+ oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512
+ oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631
+
+Metadata only (at least this set)
+
+### hypotheses.org
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:mittelalter/9529
+base_url | http://mittelalter.hypotheses.org/9529
+terminal_url | https://mittelalter.hypotheses.org/9529
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/18638
+base_url | http://archivalia.hypotheses.org/18638
+terminal_url | https://archivalia.hypotheses.org/18638
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/13614
+base_url | http://archivalia.hypotheses.org/13614
+terminal_url | https://archivalia.hypotheses.org/13614
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:teteschercheuses/2785
+base_url | http://teteschercheuses.hypotheses.org/2785
+terminal_url | https://teteschercheuses.hypotheses.org/2785
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:altervsego/608
+base_url | http://altervsego.hypotheses.org/608
+terminal_url | http://altervsego.hypotheses.org/608
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivewk1/21905
+base_url | http://archivewk1.hypotheses.org/21905
+terminal_url | https://archivewk1.hypotheses.org/21905
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:slkdiaspo/3321
+base_url | http://slkdiaspo.hypotheses.org/3321
+terminal_url | https://slkdiaspo.hypotheses.org/3321
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:diga/280
+base_url | http://diga.hypotheses.org/280
+terminal_url | https://diga.hypotheses.org/280
+
+These are all a big mix... basically blogs. Should continue crawling, but expect no yield.
+
+### t2r2.star.titech.ac.jp (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00105099
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00101346
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50161100
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00232407
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50120040
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50321440
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50235666
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+
+
+### quod.lib.umich.edu
+
+-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2
+base_url | http://name.umdl.umich.edu/acf2679.0015.003
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:b14970.0001.001
+base_url | http://name.umdl.umich.edu/B14970.0001.001
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3
+base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43
+base_url | http://name.umdl.umich.edu/acg2248.1-16.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9
+base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9
+base_url | http://name.umdl.umich.edu/acg1336.1-24.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a
+base_url | http://name.umdl.umich.edu/africanamer.0002.32a
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a
+
+These are... issues of journals? Should continue to crawl, but not expect much.
+
+### evastar-karlsruhe.de (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:270011444
+base_url | https://publikationen.bibliothek.kit.edu/270011444
+terminal_url | https://publikationen.bibliothek.kit.edu/270011444
+-[ RECORD 2 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000050117
+base_url | https://publikationen.bibliothek.kit.edu/1000050117
+terminal_url | https://publikationen.bibliothek.kit.edu/1000050117
+-[ RECORD 3 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:362296
+base_url | https://publikationen.bibliothek.kit.edu/362296
+terminal_url | https://publikationen.bibliothek.kit.edu/362296
+-[ RECORD 4 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:23042000
+base_url | https://publikationen.bibliothek.kit.edu/23042000
+terminal_url | https://publikationen.bibliothek.kit.edu/23042000
+-[ RECORD 5 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000069945
+base_url | https://publikationen.bibliothek.kit.edu/1000069945
+terminal_url | https://publikationen.bibliothek.kit.edu/1000069945
+
+
+### repository.ust.hk
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-67233
+base_url | http://repository.ust.hk/ir/Record/1783.1-67233
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-63232
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017
+terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-2891
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103
+terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-56231
+base_url | http://repository.ust.hk/ir/Record/1783.1-56231
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231
+
+[...]
+
+-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-24872
+base_url | http://repository.ust.hk/ir/Record/1783.1-24872
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872
+-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-3457
+base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+terminal_url | http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-73215
+base_url | http://repository.ust.hk/ir/Record/1783.1-73215
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215
+
+DONE: gateway.isiknowledge.com is bogus/blocking?
+
+
+### edoc.mpg.de (SKIP-DEPRECATED)
+
+ oai_id | base_url | terminal_url
+------------------------+---------------------------+---------------------------
+ oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650
+ oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195
+ oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655
+ oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179
+ oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141
+ oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412
+ oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531
+ oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047
+ oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650
+ oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852
+
+This whole instance seems to have been replaced
+
+### bibliotecadigital.jcyl.es (SKIP-DIGITIZED)
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000039962
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+-[ RECORD 2 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14075
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+-[ RECORD 3 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:4842
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+-[ RECORD 4 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14799
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+-[ RECORD 5 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:821
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+
+Digitized images as pages; too much to deal with for now.
+
+### orbi.ulg.ac.be
+
+-[ RECORD 1 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/128079
+base_url | https://orbi.uliege.be/handle/2268/128079
+terminal_url | https://orbi.uliege.be/handle/2268/128079
+-[ RECORD 2 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/67659
+base_url | https://orbi.uliege.be/handle/2268/67659
+terminal_url | https://orbi.uliege.be/handle/2268/67659
+-[ RECORD 3 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/35521
+base_url | https://orbi.uliege.be/handle/2268/35521
+terminal_url | https://orbi.uliege.be/handle/2268/35521
+-[ RECORD 4 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/107922
+base_url | https://orbi.uliege.be/handle/2268/107922
+terminal_url | https://orbi.uliege.be/handle/2268/107922
+-[ RECORD 5 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/215694
+base_url | https://orbi.uliege.be/handle/2268/215694
+terminal_url | https://orbi.uliege.be/handle/2268/215694
+
+Described below.
+
+### library.wur.nl (FIXED-BESPOKE)
+
+ oai_id | base_url | terminal_url
+ -----------------------------------+------------------------------------------------+------------------------------------------------
+ oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939
+ oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707
+ oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208
+ oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378
+ oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416
+ oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930
+ oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076
+ oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109
+ oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146
+ oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922
+ (10 rows)
+
+Seems like a one-off site? But added a pattern.
+
+### pure.atira.dk
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38
+base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694
+terminal_url | https://www.tandfonline.com/action/cookieAbsent
+-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+
+Metadata only
+
+DONE: /cookieAbsent is cookie block
+ https://www.tandfonline.com/action/cookieAbsent
+
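+Same idea as the loginwall pattern above: a sketch of a breakdown of which sources are hitting the tandfonline cookie block, to see where the re-classification matters:
+
+    SELECT ingest_request.link_source, COUNT(*)
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_request.ingest_type = ingest_file_result.ingest_type
+        AND ingest_request.base_url = ingest_file_result.base_url
+    WHERE ingest_file_result.terminal_url LIKE '%/action/cookieAbsent%'
+    GROUP BY ingest_request.link_source
+    ORDER BY COUNT(*) DESC;
+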
+### bib-pubdb1.desy.de (FIXED-INVENIO)
+
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:96756
+base_url | http://bib-pubdb1.desy.de/record/96756
+terminal_url | http://bib-pubdb1.desy.de/record/96756
+
+Metadata only.
+
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:416556
+base_url | http://bib-pubdb1.desy.de/record/416556
+terminal_url | http://bib-pubdb1.desy.de/record/416556
+
+Fixed!
+
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:414545
+base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:170169
+base_url | http://bib-pubdb1.desy.de/record/170169
+terminal_url | http://bib-pubdb1.desy.de/record/170169
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:191154
+base_url | http://bib-pubdb1.desy.de/record/191154
+terminal_url | http://bib-pubdb1.desy.de/record/191154
+
+Metadata only
+
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:155092
+base_url | http://bib-pubdb1.desy.de/record/155092
+terminal_url | http://bib-pubdb1.desy.de/record/155092
+
+Fixed!
+
+-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:97158
+base_url | http://bib-pubdb1.desy.de/record/97158
+terminal_url | http://bib-pubdb1.desy.de/record/97158
+
+Metadata only
+
+"Powered by Invenio v1.1.7"
+
+Can/should skip the "search" URLs
+
+### serval.unil.ch
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_60346fc75171
+base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_4db47fc4b593
+base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_57aac24fe115
+base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_deabae6baf6c
+base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_a5ec0df1370f
+base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_080300c2e23c
+base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_de777dd2b07f
+base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F
+-[ RECORD 8 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_5e824e244c27
+base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27
+
+Metadata only? See elsewhere.
+
+### Random Links
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dbc.wroc.pl:41031
+base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+
+This is some platform/package thing. PDF is in an iframe. Platform is "DLibra".
+FIXED-DLIBRA
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/174291
+base_url | https://orbi.uliege.be/handle/2268/174291
+terminal_url | https://orbi.uliege.be/handle/2268/174291
+
+DSpace platform. There are multiple files, and little to "select" on.
+
+https://orbi.uliege.be/handle/2268/174200 has only a single PDF and is easier to work with
+
+PARTIAL-DSPACE
+
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.tue.nl:664163
+base_url | http://repository.tue.nl/664163
+terminal_url | http://repository.tue.nl/664163
+
+Ah, this is the Pure platform from Elsevier.
+Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance
+
+FIXED-PURE
+
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:49579
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+
+(handled above)
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/97937
+base_url | https://orcid.org/0000-0002-2066-2082
+terminal_url | https://orcid.org/0000-0002-2066-2082
+
+ORCID! Skip it.
+
+DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time.
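+
+As a sanity check on how many requests this would skip, a count of OAI requests whose base or terminal URL is an ORCID profile (sketch):
+
+    SELECT COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.link_source = 'oai'
+        AND (ingest_request.base_url LIKE 'https://orcid.org/%'
+            OR ingest_file_result.terminal_url LIKE 'https://orcid.org/%');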
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:edoc.mpg.de:360269
+base_url | http://edoc.mpg.de/360269
+terminal_url | http://edoc.mpg.de/360269
+
+Seems like this whole repo has disappeared, or been replaced by... Pure? Maybe a different Pure?
+
+DONE: edoc.mpg.de -> pure.mpg.de
+
+-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:books.openedition.org:msha/17716
+base_url | http://books.openedition.org/msha/17716
+terminal_url | https://books.openedition.org/msha/17716
+
+OpenEdition is free to read as HTML, but not as PDF (or epub, etc.).
+
+TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest)
+
+HTML-WORKED
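+
+To scope the TODO above, a sketch for sampling openedition records (with their current PDF-ingest status) that could be re-submitted as HTML ingest requests:
+
+    SELECT
+        ingest_request.link_source_id AS oai_id,
+        ingest_request.base_url AS base_url,
+        ingest_file_result.status AS status
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'oai'
+        AND ingest_request.link_source_id LIKE 'oai:books.openedition.org:%'
+    ORDER BY random()
+    LIMIT 20;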
+
+-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epub.oeaw.ac.at:0x003aba48
+base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+
+requires login
+
+FORBIDDEN
+
+-[ RECORD 9 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/88986
+base_url | https://orcid.org/0000-0002-4147-2560
+terminal_url | https://orcid.org/0000-0002-4147-2560
+
+DONE: skip orcids
+
+-[ RECORD 10 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-28786
+base_url | http://repository.ust.hk/ir/Record/1783.1-28786
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786
+
+Generator: VuFind 5.1.1
+just a metadata record
+
+METADATA-ONLY
+
+-[ RECORD 11 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:rcin.org.pl:50797
+base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+
+Seems like a software platform? not sure.
+
+METADATA-ONLY
+
+-[ RECORD 12 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dea.lib.unideb.hu:2437/69641
+base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+
+-[ RECORD 13 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871
+base_url | http://handle.unsw.edu.au/1959.4/64871
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L
+
+-[ RECORD 14 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.wbc.poznan.pl:225930
+base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+
+SOFT-404
+
+-[ RECORD 15 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.erciyes.edu.tr:105
+base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105
+terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105
+
+GONE (domain not registered)
+
+-[ RECORD 16 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:37500
+base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+
+Seems like a bespoke site
+
+SKIP-BESPOKE
+
+-[ RECORD 17 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50401364
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+
+METADATA-ONLY
+
+-[ RECORD 18 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epubs.cclrc.ac.uk:work/4714
+base_url | http://purl.org/net/epubs/work/4714
+terminal_url | https://epubs.stfc.ac.uk/work/4714
+
+It's got a purl! haha.
+
+METADATA-ONLY
+
+------
+
+Another batch! With some repeat domains removed.
+
+-[ RECORD 1 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc
+base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc
+terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov
+
+SKIP
+
+-[ RECORD 2 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-05302014-183910
+base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+
+Some software platform? Pretty basic/bespoke
+
+FIXED-PARTIAL
+
+-[ RECORD 3 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000098246
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+
+SKIP (see elsewhere)
+
+-[ RECORD 7 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:elektra.cdaea.es:documento.29259
+base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+
+Photo.
+
+SKIP-SCOPE
+
+-[ RECORD 9 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829
+base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L
+
+METADATA-ONLY
+
+-[ RECORD 12 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a
+base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+terminal_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+
+unsure
+
+-[ RECORD 16 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/369344
+base_url | https://library.wur.nl/WebQuery/wurpubs/369344
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344
+
+This specific record is not OA (but the site is fine/fixed).
+
+-[ RECORD 17 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:escholarship.umassmed.edu:oapubs-2146
+base_url | https://escholarship.umassmed.edu/oapubs/1147
+terminal_url | http://escholarship.umassmed.edu/oapubs/1147/
+
+just links to publisher (no content in repo)
+
+-[ RECORD 18 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010
+base_url | https://digitalcommons.usu.edu/wild_facpub/11
+terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/
+
+also just links to publisher (no content in repo)
+
+-[ RECORD 25 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:igi.indrastra.com:306768
+base_url | http://igi.indrastra.com/items/show/306768
+terminal_url | http://igi.indrastra.com/items/show/306768
+
+(see elsewhere)
+
+-[ RECORD 26 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:fau.digital.flvc.org:fau_9804
+base_url | http://purl.flvc.org/fcla/dt/12932
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804
+
+Islandora.
+
+-[ RECORD 27 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.lu.lv:7/16019
+base_url | https://dspace.lu.lv/dspace/handle/7/16019
+terminal_url | https://dspace.lu.lv/dspace/handle/7/16019
+
+LOGINWALL
+
+-[ RECORD 28 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:zir.nsk.hr:umas_218
+base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+
+REMOVED
+
+
+-[ RECORD 29 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:36390
+base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+
+Book, with chapters, not an individual work.
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:krm.or.kr:10056135m201r
+base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y
+terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135
+
+research results repository; keep crawling
+
+SKIP-SCOPE
+
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.db-thueringen.de:dbt_mods_00005191
+base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+
+powered by "MyCoRe"
+
+FIXED-MYCORE
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405
+base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+
+seems to be a general purpose regional library? not research-specific
+
+SKIP-UNSURE
+
+-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-02272019-123644
+base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+
+This specific URL is not available (FORBIDDEN)
+
+others have multiple files, not just a single PDF:
+https://etd.adm.unipi.it/t/etd-09102013-124430/
+
+SKIP-UNSURE
+
+-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:commons.ln.edu.hk:sw_master-5408
+base_url | https://commons.ln.edu.hk/sw_master/4408
+terminal_url | https://commons.ln.edu.hk/sw_master/4408/
+
+worth crawling I guess
+
+METADATA-ONLY
+
+-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:mouseion.jax.org:ssbb1976-1224
+base_url | https://mouseion.jax.org/ssbb1976/225
+terminal_url | https://mouseion.jax.org/ssbb1976/225/
+
+METADATA-ONLY
+
+-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aleph.bib-bvb.de:bvb01-016604343
+base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer
+terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true
+
+SOFT-404 / FORBIDDEN (cookie timeout)
+
+-[ RECORD 14 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bivaldi.gva.es:11740
+base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+
+
+-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/443282
+base_url | https://library.wur.nl/WebQuery/wurpubs/443282
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282
+
+DIGIBIS platform (like some others)
+
+FIXED-PARTIAL
+
+-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:hal:in2p3-00414135v1
+base_url | http://hal.in2p3.fr/in2p3-00414135
+terminal_url | http://hal.in2p3.fr:80/in2p3-00414135
+
+METADATA-ONLY
+
+-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aaltodoc.aalto.fi:123456789/13201
+base_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+
+This specific record is not accessible.
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002
+
+DSpace 5.4
+
+Worked (from recent changes)
+
+
+-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:sedici.unlp.edu.ar:10915/40144
+base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+
+This is a journal! Cool. Plone software platform.
+
+FIXED
+
+## Top no-capture Domains
+
+Top terminal no-capture domains:
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | count
+ -----------------------------------+-------
+ digitalrepository.unm.edu | 94087
+ escholarship.org | 80632
+ ir.opt.ac.cn | 70504
+ idus.us.es | 67908
+ www.cambridge.org | 56376
+ www.ssoar.info | 52534
+ rep.bntu.by | 52127
+ scholarworks.umt.edu | 48546
+ publikationen.ub.uni-frankfurt.de | 46987
+ dk.um.si | 45753
+ repositorio.uladech.edu.pe | 37028
+ uu.diva-portal.org | 34929
+ digitalcommons.law.byu.edu | 31732
+ sedici.unlp.edu.ar | 31233
+ elib.sfu-kras.ru | 29131
+ jyx.jyu.fi | 28144
+ www.repository.cam.ac.uk | 27728
+ nagoya.repo.nii.ac.jp | 26673
+ www.duo.uio.no | 25258
+ www.persee.fr | 24968
+ www2.senado.leg.br | 24426
+ tesis.ucsm.edu.pe | 24049
+ digitalcommons.unl.edu | 21974
+ www.degruyter.com | 21940
+ www.igi-global.com | 20736
+ thekeep.eiu.edu | 20712
+ docs.lib.purdue.edu | 20538
+ repositorio.cepal.org | 20280
+ elib.bsu.by | 19620
+ minds.wisconsin.edu | 19473
+ (30 rows)
+
+These all seem worth crawling. A couple of publisher domains (cambridge.org) and
+persee.fr will probably fail, but those are not too many URLs.
+
+## Summary of Filtered Prefixes and Domains (OAI-PMH)
+
+oai:kb.dk:
+ too large and generic
+oai:bdr.oai.bsb-muenchen.de:
+ too large and generic
+oai:hispana.mcu.es:
+ too large and generic
+oai:bnf.fr:
+ too large and generic
+oai:ukm.si:
+ too large and generic
+oai:biodiversitylibrary.org:
+ redundant with other ingest and archive.org content
+oai:hsp.org:
+ large; historical content only
+oai:repec:
+ large; mostly (entirely?) links to publisher sites
+oai:n/a:
+ meta?
+oai:quod.lib.umich.edu:
+ entire issues? hard to crawl so skip for now
+oai:hypotheses.org:
+ HTML, not PDF
+oai:americanae.aecid.es:
+ large, complex. skip for now
+oai:www.irgrid.ac.cn:
+ aggregator of other IRs
+oai:espace.library.uq.edu.au:
+ large; metadata only; javascript heavy (poor heritrix crawling)
+oai:edoc.mpg.de:
+ deprecated domain, with no redirects
+oai:bibliotecadigital.jcyl.es:
+ digitized historical docs; hard to crawl, skip for now
+oai:repository.erciyes.edu.tr:
+ gone (domain lapsed)
+oai:krm.or.kr:
+ "research results repository" (metadata only)
+
+www.kb.dk
+ large, general purpose; out of scope
+kb-images.kb.dk
+ deprecated
+mdz-nbn-resolving.de
+ multiple prefixes end up here. historical docs; out of scope
+aggr.ukm.um.si
+ large, out of scope
+edoc.mpg.de
+ deprecated domain
+doaj.org
+ index (metadata only)
+orcid.org
+ out of scope
+gateway.isiknowledge.com
+ clarivate login/paywall (skipping in ingest)
+
+Needs filtering to a subset of records (by 'set' or other filtering?):
+
+oai:igi.indrastra.com:
+oai:invenio.nusl.cz:
+oai:t2r2.star.titech.ac.jp:
+oai:evastar-karlsruhe.de:
+oai:repository.ust.hk:
+oai:serval.unil.ch:
+oai:pure.atira.dk:
+
+Filters in SQL syntax:
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+
+and in some contexts (for PDF ingest; these should switch to HTML ingest):
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+## Overall Summary of OAI-PMH Stuff
+
+Big picture is that the majority of `no-pdf-link` crawl statuses are because of
+repository scope, record scope, or content format issues. That said, a sizable
+fraction of sites were platforms (like DSpace) that were not ingesting well.
+
+A significant fraction of records are "metadata only" (of papers), or non-paper
+entity types (like persons, grants, or journal titles), and a growing fraction
+(?) are metadata plus link to OA publisher fulltext (offsite). Might be
+possible to detect these at ingest time, or earlier at OAI-PMH
+harvest/transform time and filter them out.
+
+It may be worthwhile to attempt ingest of multiple existing captures
+(timestamps) in the ingest pipeline. Eg, instead of choosing a single "best"
+capture, if there are multiple HTTP 200 status captures, try ingest with each
+(or at least a couple). This is because repository software gets upgraded, so
+old "no-capture" or "not found" or "link loop" type captures may work when
+recrawled.
+
+New summary with additional filters:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 12872279
+ no-pdf-link | 9329602
+ no-capture | 4696362
+ redirect-loop | 1541458
+ terminal-bad-status | 660418
+ link-loop | 452831
+ wrong-mimetype | 434868
+ null-body | 71065
+ cdx-error | 17005
+ | 15275
+ petabox-error | 12743
+ wayback-error | 11759
+ skip-url-blocklist | 182
+ gateway-timeout | 122
+ redirects-exceeded | 120
+ bad-redirect | 117
+ bad-gzip-encoding | 111
+ wayback-content-error | 102
+ timeout | 72
+ blocked-cookie | 62
+ (20 rows)
+
diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md
new file mode 100644
index 0000000..a0bb0c5
--- /dev/null
+++ b/notes/ingest/2021-09-03_daily_improvements.md
@@ -0,0 +1,1021 @@
+
+Periodic check-in of daily crawling/ingest.
+
+Overall ingest status, past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------+--------
+ pdf | no-pdf-link | 158474
+ pdf | spn2-cdx-lookup-failure | 135344
+ pdf | success | 127938
+ pdf | spn2-error | 65411
+ pdf | gateway-timeout | 63112
+ pdf | blocked-cookie | 26338
+ pdf | terminal-bad-status | 24853
+ pdf | link-loop | 15699
+ pdf | spn2-error:job-failed | 13862
+ pdf | redirect-loop | 11432
+ pdf | cdx-error | 2376
+ pdf | too-many-redirects | 2186
+ pdf | wrong-mimetype | 2142
+ pdf | forbidden | 1758
+ pdf | spn2-error:no-status | 972
+ pdf | not-found | 820
+ pdf | bad-redirect | 536
+ pdf | read-timeout | 392
+ pdf | wayback-error | 251
+ pdf | remote-server-error | 220
+ (20 rows)
+
+Hrm, that is a healthy fraction of `no-pdf-link`.
+
+Broken domains, past 30 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 39678
+ osf.io | gateway-timeout | 29809
+ acervus.unicamp.br | no-pdf-link | 21978
+ osf.io | terminal-bad-status | 18727
+ zenodo.org | spn2-cdx-lookup-failure | 17008
+ doi.org | spn2-cdx-lookup-failure | 15503
+ www.degruyter.com | no-pdf-link | 15122
+ ieeexplore.ieee.org | spn2-error:job-failed | 12921
+ osf.io | spn2-cdx-lookup-failure | 11123
+ www.tandfonline.com | blocked-cookie | 8096
+ www.morressier.com | no-pdf-link | 4655
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580
+ pubs.acs.org | blocked-cookie | 4415
+ www.frontiersin.org | no-pdf-link | 4163
+ www.degruyter.com | spn2-cdx-lookup-failure | 3788
+ www.taylorfrancis.com | no-pdf-link | 3568
+ www.sciencedirect.com | no-pdf-link | 3128
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116
+ acervus.unicamp.br | spn2-cdx-lookup-failure | 2797
+ www.mdpi.com | spn2-cdx-lookup-failure | 2719
+ brill.com | link-loop | 2681
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 2546
+ apps.crossref.org | no-pdf-link | 2537
+ onlinelibrary.wiley.com | blocked-cookie | 2528
+ (25 rows)
+
+Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure:
+
+ SELECT domain, status, count
+ FROM (
+ SELECT domain, status, COUNT((domain, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status != 'spn2-cdx-lookup-failure'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY CUBE (domain, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY domain ASC , count DESC;
+
+
+ domain | status | count
+ -----------------------------------------------------------------+-----------------------+--------
+ academic.oup.com | | 2405
+ academic.oup.com | no-pdf-link | 1240
+ academic.oup.com | link-loop | 1010
+ acervus.unicamp.br | | 21980
+ acervus.unicamp.br | no-pdf-link | 21978 **
+ aclanthology.org | | 208
+ acp.copernicus.org | | 365
+ acp.copernicus.org | success | 356
+ aip.scitation.org | | 1071
+ aip.scitation.org | blocked-cookie | 843
+ aip.scitation.org | redirect-loop | 227
+ apps.crossref.org | | 2537
+ apps.crossref.org | no-pdf-link | 2537
+ arxiv.org | | 17817
+ arxiv.org | success | 17370
+ arxiv.org | terminal-bad-status | 320
+ asmedigitalcollection.asme.org | | 401
+ asmedigitalcollection.asme.org | link-loop | 364
+ assets.researchsquare.com | | 3706
+ assets.researchsquare.com | success | 3706
+ avmj.journals.ekb.eg | | 605
+ avmj.journals.ekb.eg | success | 595
+ bfa.journals.ekb.eg | | 224
+ bfa.journals.ekb.eg | success | 214
+ biorxiv.org | redirect-loop | 895
+ biorxiv.org | | 895
+ birdsoftheworld.org | | 286
+ birdsoftheworld.org | no-pdf-link | 285
+ bmjopen.bmj.com | success | 232
+ bmjopen.bmj.com | | 232
+ books.openedition.org | | 396
+ books.openedition.org | no-pdf-link | 396
+ brill.com | | 4272
+ brill.com | link-loop | 2681
+ brill.com | no-pdf-link | 1410
+ cas.columbia.edu | | 1038
+ cas.columbia.edu | no-pdf-link | 1038 **
+ cdr.lib.unc.edu | | 513
+ cdr.lib.unc.edu | success | 469
+ chemrxiv.org | | 278
+ chemrxiv.org | success | 275
+ classiques-garnier.com | | 531
+ classiques-garnier.com | no-pdf-link | 487 *
+ content.iospress.com | | 275
+ content.iospress.com | link-loop | 230
+ cris.maastrichtuniversity.nl | | 318
+ cris.maastrichtuniversity.nl | success | 284
+ cyberleninka.ru | | 1165
+ cyberleninka.ru | success | 1134
+ deepblue.lib.umich.edu | | 289
+ dergipark.org.tr | | 1185
+ dergipark.org.tr | success | 774
+ dergipark.org.tr | no-pdf-link | 320
+ didaktorika.gr | | 688
+ didaktorika.gr | redirect-loop | 688
+ digi.ub.uni-heidelberg.de | | 292
+ digi.ub.uni-heidelberg.de | no-pdf-link | 292
+ direct.mit.edu | | 236
+ direct.mit.edu | no-pdf-link | 207 *
+ dl.acm.org | | 2319
+ dl.acm.org | blocked-cookie | 2230
+ dmtcs.episciences.org | | 733
+ dmtcs.episciences.org | success | 730
+ doi.ala.org.au | no-pdf-link | 2373 **
+ doi.ala.org.au | | 2373
+ doi.org | | 732
+ doi.org | terminal-bad-status | 673
+ downloads.hindawi.com | success | 1452
+ downloads.hindawi.com | | 1452
+ drive.google.com | | 216
+ drive.google.com | no-pdf-link | 211
+ dtb.bmj.com | | 674
+ dtb.bmj.com | link-loop | 669
+ easy.dans.knaw.nl | no-pdf-link | 261 *
+ easy.dans.knaw.nl | | 261
+ ebooks.marilia.unesp.br | | 688
+ ebooks.marilia.unesp.br | no-pdf-link | 688 *
+ ehp.niehs.nih.gov | | 766
+ ehp.niehs.nih.gov | blocked-cookie | 765
+ ejournal.mandalanursa.org | | 307
+ ejournal.mandalanursa.org | success | 305
+ elib.spbstu.ru | | 264
+ elib.spbstu.ru | redirect-loop | 257
+ elibrary.ru | | 1367
+ elibrary.ru | redirect-loop | 1169
+ elibrary.vdi-verlag.de | | 1251
+ elibrary.vdi-verlag.de | no-pdf-link | 646
+ elibrary.vdi-verlag.de | link-loop | 537
+ elifesciences.org | | 328
+ elifesciences.org | success | 323
+ figshare.com | | 803
+ figshare.com | no-pdf-link | 714 *
+ files.osf.io | | 745
+ files.osf.io | success | 614
+ hammer.purdue.edu | | 244
+ hammer.purdue.edu | no-pdf-link | 243
+ heiup.uni-heidelberg.de | | 277
+ heiup.uni-heidelberg.de | no-pdf-link | 268
+ hkvalidate.perfdrive.com | no-pdf-link | 370 *
+ hkvalidate.perfdrive.com | | 370
+ ieeexplore.ieee.org | | 16675
+ ieeexplore.ieee.org | spn2-error:job-failed | 12927
+ ieeexplore.ieee.org | success | 1952
+ ieeexplore.ieee.org | too-many-redirects | 1193
+ ieeexplore.ieee.org | no-pdf-link | 419
+ jamanetwork.com | | 339
+ jamanetwork.com | success | 216
+ jmstt.ntou.edu.tw | | 244
+ jmstt.ntou.edu.tw | success | 241
+ journal.ipb.ac.id | | 229
+ journal.ipb.ac.id | success | 206
+ journal.nafe.org | | 221
+ journals.aps.org | | 614
+ journals.aps.org | gateway-timeout | 495
+ journals.asm.org | | 463
+ journals.asm.org | blocked-cookie | 435
+ journals.flvc.org | | 230
+ journals.lww.com | | 1300
+ journals.lww.com | link-loop | 1284
+ journals.openedition.org | | 543
+ journals.openedition.org | success | 311
+ journals.ub.uni-heidelberg.de | | 357
+ journals.ub.uni-heidelberg.de | success | 311
+ jov.arvojournals.org | | 431
+ jov.arvojournals.org | no-pdf-link | 422 *
+ kiss.kstudy.com | | 303
+ kiss.kstudy.com | no-pdf-link | 303 *
+ library.iated.org | | 364
+ library.iated.org | redirect-loop | 264
+ library.seg.org | blocked-cookie | 301
+ library.seg.org | | 301
+ link.aps.org | redirect-loop | 442
+ link.aps.org | | 442
+ linkinghub.elsevier.com | | 515
+ linkinghub.elsevier.com | gateway-timeout | 392
+ mc.sbm.org.br | | 224
+ mc.sbm.org.br | success | 224
+ mdpi-res.com | | 742
+ mdpi-res.com | success | 742
+ mdsoar.org | | 220
+ mediarep.org | | 269
+ mediarep.org | success | 264
+ medrxiv.org | redirect-loop | 290
+ medrxiv.org | | 290
+ muse.jhu.edu | | 429
+ muse.jhu.edu | terminal-bad-status | 391
+ mvmj.journals.ekb.eg | | 306
+ oapub.org | | 292
+ oapub.org | success | 289
+ onepetro.org | | 426
+ onepetro.org | link-loop | 406
+ onlinelibrary.wiley.com | | 2835
+ onlinelibrary.wiley.com | blocked-cookie | 2531
+ onlinelibrary.wiley.com | redirect-loop | 264
+ open.library.ubc.ca | | 569
+ open.library.ubc.ca | no-pdf-link | 425 *
+ opendata.uni-halle.de | | 407
+ opendata.uni-halle.de | success | 263
+ osf.io | | 49022
+ osf.io | gateway-timeout | 29810
+ osf.io | terminal-bad-status | 18731
+ osf.io | spn2-error | 247
+ osf.io | not-found | 205
+ oxford.universitypressscholarship.com | | 392
+ oxford.universitypressscholarship.com | link-loop | 233
+ panor.ru | no-pdf-link | 433 *
+ panor.ru | | 433
+ papers.ssrn.com | | 1630
+ papers.ssrn.com | link-loop | 1598
+ pdf.sciencedirectassets.com | | 3063
+ pdf.sciencedirectassets.com | success | 3063
+ peerj.com | | 464
+ peerj.com | no-pdf-link | 303 *
+ periodicos.ufpe.br | | 245
+ periodicos.ufpe.br | success | 232
+ periodicos.unb.br | | 230
+ periodicos.unb.br | success | 221
+ preprints.jmir.org | | 548
+ preprints.jmir.org | cdx-error | 499
+ publications.rwth-aachen.de | | 213
+ publikationen.bibliothek.kit.edu | | 346
+ publikationen.bibliothek.kit.edu | success | 314
+ publikationen.uni-tuebingen.de | | 623
+ publikationen.uni-tuebingen.de | no-pdf-link | 522 *
+ publons.com | no-pdf-link | 934 *
+ publons.com | | 934
+ pubs.acs.org | | 4507
+ pubs.acs.org | blocked-cookie | 4406
+ pubs.rsc.org | | 1638
+ pubs.rsc.org | link-loop | 1054
+ pubs.rsc.org | redirect-loop | 343
+ pubs.rsc.org | success | 201
+ repositorio.ufu.br | | 637
+ repositorio.ufu.br | success | 607
+ repository.dri.ie | | 1852
+ repository.dri.ie | no-pdf-link | 1852 **
+ repository.library.brown.edu | | 293
+ repository.library.brown.edu | no-pdf-link | 291 *
+ res.mdpi.com | | 10367
+ res.mdpi.com | success | 10360
+ retrovirology.biomedcentral.com | | 230
+ revistas.ufrj.br | | 284
+ revistas.ufrj.br | success | 283
+ revistas.uptc.edu.co | | 385
+ revistas.uptc.edu.co | success | 344
+ royalsocietypublishing.org | | 231
+ rsdjournal.org | | 347
+ rsdjournal.org | success | 343
+ s3-ap-southeast-2.amazonaws.com | | 400
+ s3-ap-southeast-2.amazonaws.com | success | 392
+ s3-eu-west-1.amazonaws.com | | 2096
+ s3-eu-west-1.amazonaws.com | success | 2091
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286
+ s3.ca-central-1.amazonaws.com | | 202
+ sage.figshare.com | | 242
+ sage.figshare.com | no-pdf-link | 241
+ sajeb.org | | 246
+ sajeb.org | no-pdf-link | 243
+ scholar.dkyobobook.co.kr | | 332
+ scholar.dkyobobook.co.kr | no-pdf-link | 328 *
+ search.mandumah.com | | 735
+ search.mandumah.com | redirect-loop | 726
+ secure.jbs.elsevierhealth.com | | 1112
+ secure.jbs.elsevierhealth.com | blocked-cookie | 1108
+ stm.bookpi.org | no-pdf-link | 468 *
+ stm.bookpi.org | | 468
+ storage.googleapis.com | | 1012
+ storage.googleapis.com | success | 1012
+ tandf.figshare.com | | 469
+ tandf.figshare.com | no-pdf-link | 466
+ teses.usp.br | | 739
+ teses.usp.br | success | 730
+ tidsskrift.dk | | 360
+ tidsskrift.dk | success | 346
+ tiedejaedistys.journal.fi | | 224
+ tind-customer-agecon.s3.amazonaws.com | success | 332
+ tind-customer-agecon.s3.amazonaws.com | | 332
+ valep.vc.univie.ac.at | no-pdf-link | 280
+ valep.vc.univie.ac.at | | 280
+ watermark.silverchair.com | | 1729
+ watermark.silverchair.com | success | 1719
+ www.academia.edu | | 387
+ www.academia.edu | no-pdf-link | 386
+ www.ahajournals.org | | 430
+ www.ahajournals.org | blocked-cookie | 413
+ www.atenaeditora.com.br | | 572
+ www.atenaeditora.com.br | terminal-bad-status | 513
+ www.atlantis-press.com | success | 722
+ www.atlantis-press.com | | 722
+ www.aup-online.com | | 419
+ www.aup-online.com | no-pdf-link | 419 *
+ www.beck-elibrary.de | | 269
+ www.beck-elibrary.de | no-pdf-link | 268 *
+ www.biodiversitylibrary.org | no-pdf-link | 528 *
+ www.biodiversitylibrary.org | | 528
+ www.bloomsburycollections.com | | 623
+ www.bloomsburycollections.com | no-pdf-link | 605 *
+ www.cabi.org | | 2191
+ www.cabi.org | no-pdf-link | 2186 *
+ www.cairn.info | | 1283
+ www.cairn.info | no-pdf-link | 713
+ www.cairn.info | link-loop | 345
+ www.cambridge.org | | 4128
+ www.cambridge.org | no-pdf-link | 1531
+ www.cambridge.org | success | 1441
+ www.cambridge.org | link-loop | 971
+ www.cureus.com | no-pdf-link | 526 *
+ www.cureus.com | | 526
+ www.dbpia.co.kr | | 637
+ www.dbpia.co.kr | redirect-loop | 631
+ www.deboni.he.com.br | | 382
+ www.deboni.he.com.br | success | 381
+ www.degruyter.com | | 17783
+ www.degruyter.com | no-pdf-link | 15102
+ www.degruyter.com | success | 2584
+ www.dovepress.com | | 480
+ www.dovepress.com | success | 472
+ www.e-manuscripta.ch | | 1350
+ www.e-manuscripta.ch | no-pdf-link | 1350 *
+ www.e-periodica.ch | | 1276
+ www.e-periodica.ch | no-pdf-link | 1275
+ www.e-rara.ch | | 202
+ www.e-rara.ch | no-pdf-link | 202
+ www.elgaronline.com | | 495
+ www.elgaronline.com | link-loop | 290
+ www.elibrary.ru | | 922
+ www.elibrary.ru | no-pdf-link | 904
+ www.emerald.com | | 2155
+ www.emerald.com | no-pdf-link | 1936 *
+ www.emerald.com | success | 219
+ www.eurekaselect.com | | 518
+ www.eurekaselect.com | no-pdf-link | 516 *
+ www.frontiersin.org | | 4163
+ www.frontiersin.org | no-pdf-link | 4162 **
+ www.hanser-elibrary.com | | 444
+ www.hanser-elibrary.com | blocked-cookie | 444
+ www.hanspub.org | | 334
+ www.hanspub.org | no-pdf-link | 314
+ www.idunn.no | | 1736
+ www.idunn.no | link-loop | 596
+ www.idunn.no | success | 577
+ www.idunn.no | no-pdf-link | 539
+ www.igi-global.com | terminal-bad-status | 458
+ www.igi-global.com | | 458
+ www.ijcai.org | | 533
+ www.ijcai.org | success | 532
+ www.ijraset.com | success | 385
+ www.ijraset.com | | 385
+ www.inderscience.com | | 712
+ www.inderscience.com | no-pdf-link | 605 *
+ www.ingentaconnect.com | | 456
+ www.ingentaconnect.com | no-pdf-link | 413 *
+ www.internationaljournalssrg.org | | 305
+ www.internationaljournalssrg.org | no-pdf-link | 305 *
+ www.isca-speech.org | | 2392
+ www.isca-speech.org | no-pdf-link | 2391 **
+ www.journals.uchicago.edu | | 228
+ www.journals.uchicago.edu | blocked-cookie | 227
+ www.jstage.jst.go.jp | | 1492
+ www.jstage.jst.go.jp | success | 1185
+ www.jstage.jst.go.jp | no-pdf-link | 289
+ www.jstor.org | | 301
+ www.jurology.com | | 887
+ www.jurology.com | redirect-loop | 887
+ www.karger.com | | 318
+ www.liebertpub.com | | 507
+ www.liebertpub.com | blocked-cookie | 496
+ www.morressier.com | | 4781
+ www.morressier.com | no-pdf-link | 4655 **
+ www.ncl.ecu.edu | | 413
+ www.ncl.ecu.edu | success | 413
+ www.nomos-elibrary.de | | 526
+ www.nomos-elibrary.de | no-pdf-link | 391
+ www.oecd-ilibrary.org | no-pdf-link | 1170 **
+ www.oecd-ilibrary.org | | 1170
+ www.openagrar.de | no-pdf-link | 221
+ www.openagrar.de | | 221
+ www.osapublishing.org | | 900
+ www.osapublishing.org | link-loop | 615
+ www.osapublishing.org | no-pdf-link | 269
+ www.osti.gov | | 630
+ www.osti.gov | link-loop | 573
+ www.oxfordlawtrove.com | no-pdf-link | 476 *
+ www.oxfordlawtrove.com | | 476
+ www.pdcnet.org | | 298
+ www.pdcnet.org | terminal-bad-status | 262
+ www.pedocs.de | | 203
+ www.pnas.org | | 222
+ www.preprints.org | | 372
+ www.preprints.org | success | 366
+ www.repository.cam.ac.uk | | 801
+ www.repository.cam.ac.uk | success | 359
+ www.repository.cam.ac.uk | no-pdf-link | 239
+ www.research-collection.ethz.ch | | 276
+ www.research-collection.ethz.ch | terminal-bad-status | 274
+ www.revistas.usp.br | | 207
+ www.revistas.usp.br | success | 204
+ www.rina.org.uk | no-pdf-link | 1009 **
+ www.rina.org.uk | | 1009
+ www.schweizerbart.de | no-pdf-link | 202
+ www.schweizerbart.de | | 202
+ www.scielo.br | | 544
+ www.scielo.br | redirect-loop | 526
+ www.sciencedirect.com | | 3901
+ www.sciencedirect.com | no-pdf-link | 3127 **
+ www.sciencedirect.com | link-loop | 701
+ www.sciendo.com | | 384
+ www.sciendo.com | success | 363
+ www.sciengine.com | | 225
+ www.scirp.org | | 209
+ www.spandidos-publications.com | | 205
+ www.tandfonline.com | | 8925
+ www.tandfonline.com | blocked-cookie | 8099
+ www.tandfonline.com | terminal-bad-status | 477
+ www.tandfonline.com | redirect-loop | 322
+ www.taylorfrancis.com | | 6119
+ www.taylorfrancis.com | no-pdf-link | 3567
+ www.taylorfrancis.com | link-loop | 2169
+ www.taylorfrancis.com | terminal-bad-status | 353
+ www.thieme-connect.de | | 1047
+ www.thieme-connect.de | redirect-loop | 472
+ www.thieme-connect.de | spn2-error:job-failed | 343
+ www.tib.eu | | 206
+ www.trp.org.in | | 311
+ www.trp.org.in | success | 311
+ www.un-ilibrary.org | no-pdf-link | 597 *
+ www.un-ilibrary.org | | 597
+ www.vr-elibrary.de | | 775
+ www.vr-elibrary.de | blocked-cookie | 774
+ www.wjgnet.com | | 204
+ www.wjgnet.com | no-pdf-link | 204
+ www.worldscientific.com | | 974
+ www.worldscientific.com | blocked-cookie | 971
+ www.worldwidejournals.com | | 242
+ www.worldwidejournals.com | no-pdf-link | 203
+ www.wto-ilibrary.org | no-pdf-link | 295
+ www.wto-ilibrary.org | | 295
+ www.zora.uzh.ch | | 222
+ zenodo.org | | 49460
+ zenodo.org | no-pdf-link | 39721
+ zenodo.org | success | 8954
+ zenodo.org | wrong-mimetype | 562
+ | | 445919
+ | no-pdf-link | 168035
+ | success | 140875
+ | gateway-timeout | 31809
+ | blocked-cookie | 26431
+ | terminal-bad-status | 25625
+ | link-loop | 19006
+ | spn2-error:job-failed | 13962
+ | redirect-loop | 12512
+ | wrong-mimetype | 2302
+ | spn2-error | 1689
+ | too-many-redirects | 1203
+ | bad-redirect | 732
+ | cdx-error | 539
+ | not-found | 420
+ | spn2-error:no-status | 256
+ (419 rows)
+
+Get random subsets by terminal domain:
+
+ \x auto
+ SELECT
+ ingest_request.link_source_id AS link_source_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%'
+ ORDER BY random()
+ LIMIT 5;
+
+## acervus.unicamp.br
+
+Previously flagged as messy (2021-05_daily_improvements.md)
+
+## cas.columbia.edu
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-2ety-qm51
+base_url | https://doi.org/10.7916/d8-2ety-qm51
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-0zf6-d167
+base_url | https://doi.org/10.7916/d8-0zf6-d167
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-k6ha-sn43
+base_url | https://doi.org/10.7916/d8-k6ha-sn43
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-bj6t-eb07
+base_url | https://doi.org/10.7916/d8-bj6t-eb07
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-xjac-j502
+base_url | https://doi.org/10.7916/d8-xjac-j502
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+
+these are not public (loginwalls)
+
+DONE: '/login?TARGET=' as a login wall pattern
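+
+Rough (untested) query to gauge how many recent no-pdf-link results this
+pattern would have caught:
+
+    SELECT COUNT(*)
+    FROM ingest_file_result
+    LEFT JOIN ingest_request
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+        AND ingest_request.ingest_request_source = 'fatcat-changelog'
+        AND ingest_file_result.status = 'no-pdf-link'
+        AND ingest_file_result.terminal_url LIKE '%/login?TARGET=%';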
+
+## doi.ala.org.au
+
+Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md)
+
+NOTE: look at ingesting datasets
+
+## www.isca-speech.org
+
+-[ RECORD 1 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2014-84
+base_url | https://doi.org/10.21437/interspeech.2014-84
+terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html
+-[ RECORD 2 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2004-319
+base_url | https://doi.org/10.21437/interspeech.2004-319
+terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html
+-[ RECORD 3 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-372
+base_url | https://doi.org/10.21437/interspeech.2006-372
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html
+-[ RECORD 4 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2015-588
+base_url | https://doi.org/10.21437/interspeech.2015-588
+terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html
+-[ RECORD 5 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-468
+base_url | https://doi.org/10.21437/interspeech.2006-468
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html
+
+Bespoke site. Added rule to sandcrawler.
+
+NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?)
+
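+A possible candidate-selection query for that recrawl (untested sketch):
+
+    SELECT ingest_request.base_url, ingest_file_result.terminal_url
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_file_result.status = 'no-pdf-link'
+        AND ingest_file_result.terminal_url LIKE '%//www.isca-speech.org/%';
+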
+## www.morressier.com
+
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0002858v
+base_url | https://doi.org/10.1115/1.0002858v
+terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0003896v
+base_url | https://doi.org/10.1115/1.0003896v
+terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0004476v
+base_url | https://doi.org/10.1115/1.0004476v
+terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0001286v
+base_url | https://doi.org/10.1115/1.0001286v
+terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0000315v
+base_url | https://doi.org/10.1115/1.0000315v
+terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874
+
+Many of these seem to be presentations, as both video and slides. PDFs seem broken though.
+
+NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data)
+
+## www.oecd-ilibrary.org
+
+Paywall (2021-05_daily_improvements.md)
+
+## www.rina.org.uk
+
+-[ RECORD 1 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.ws.2002.10
+base_url | https://doi.org/10.3940/rina.ws.2002.10
+terminal_url | https://www.rina.org.uk/showproducts.html?product=4116
+-[ RECORD 2 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.pass.2003.16
+base_url | https://doi.org/10.3940/rina.pass.2003.16
+terminal_url | https://www.rina.org.uk/showproducts.html?product=3566
+-[ RECORD 3 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin.2013.15
+base_url | https://doi.org/10.3940/rina.icsotin.2013.15
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8017
+-[ RECORD 4 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.wfa.2010.23
+base_url | https://doi.org/10.3940/rina.wfa.2010.23
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8177
+-[ RECORD 5 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin15.2015.01
+base_url | https://doi.org/10.3940/rina.icsotin15.2015.01
+terminal_url | https://www.rina.org.uk/showproducts.html?product=7883
+
+Site is broken in some way
+
+## www.sciencedirect.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.jhlste.2021.100332
+base_url | https://doi.org/10.1016/j.jhlste.2021.100332
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.hazadv.2021.100006
+base_url | https://doi.org/10.1016/j.hazadv.2021.100006
+terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-12-822844-9.00009-8
+base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.colcom.2021.100490
+base_url | https://doi.org/10.1016/j.colcom.2021.100490
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-323-85245-6.00012-6
+base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126
+
+These no-pdf-link ones seem to just not be OA, which is expected for much of
+this domain.
+
+## repository.dri.ie
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+---------------------------------------------
+ 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941
+ 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f
+ 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102
+ 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t
+ 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726
+
+"Digital repository of Ireland"
+
+Historical scanned content. Bespoke site. Fixed.
+
+NOTE: recrawl/retry this domain
+
+## www.frontiersin.org
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/978-2-88971-147-5
+base_url | https://doi.org/10.3389/978-2-88971-147-5
+terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fnins.2021.722592
+base_url | https://doi.org/10.3389/fnins.2021.722592
+terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fcell.2021.683209
+base_url | https://doi.org/10.3389/fcell.2021.683209
+terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fmicb.2021.692474
+base_url | https://doi.org/10.3389/fmicb.2021.692474
+terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fneur.2021.676527
+base_url | https://doi.org/10.3389/fneur.2021.676527
+terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full
+
+All the `/research-topics/` URLs are out of scope.
+
+NOTE: recrawl missing frontiersin.org articles for PDFs
+NOTE: recrawl missing frontiersin.org articles for XML (?)
+
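+A possible selection query for that article recrawl; restricting to /articles/
+terminal URLs leaves out the out-of-scope /research-topics/ pages (untested
+sketch):
+
+    SELECT ingest_request.base_url
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_file_result.status = 'no-pdf-link'
+        AND ingest_file_result.terminal_url LIKE '%//www.frontiersin.org/articles/%';
+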
+-------
+
+## direct.mit.edu
+
+Previously "not available" (2021-05_daily_improvements.md)
+
+## figshare.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15052236.v6
+base_url | https://doi.org/10.6084/m9.figshare.15052236.v6
+terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.14907846.v5
+base_url | https://doi.org/10.6084/m9.figshare.14907846.v5
+terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15157614.v1
+base_url | https://doi.org/10.6084/m9.figshare.15157614.v1
+terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15172926.v1
+base_url | https://doi.org/10.6084/m9.figshare.15172926.v1
+terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.16532574.v1
+base_url | https://doi.org/10.6084/m9.figshare.16532574.v1
+terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1
+
+NOTE: can determine the content type from the redirect URL path (eg,
+/articles/software/, /articles/preprint/), I guess. This is helpful for ingest!
+Could also potentially correct fatcat release_type using this info; see the
+query sketch below.
+
+We seem to be getting the ones we can (eg, papers) just fine
+
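+The type segment could be pulled straight out of the terminal URL, something
+like (untested sketch):
+
+    SELECT
+        substring(ingest_file_result.terminal_url FROM '//figshare\.com/articles/([^/]+)/') AS figshare_type,
+        COUNT(*)
+    FROM ingest_file_result
+    WHERE ingest_file_result.terminal_url LIKE '%//figshare.com/articles/%'
+    GROUP BY figshare_type
+    ORDER BY COUNT DESC;
+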
+## hkvalidate.perfdrive.com
+
+Should be skipping/bailing on this domain, but is not, for some reason.
+
+-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05cc
+base_url | https://doi.org/10.3847/1538-4357/ac05cc
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 2 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac0429
+base_url | https://doi.org/10.3847/1538-4357/ac0429
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 3 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1149/1945-7111/ac1a85
+base_url | https://doi.org/10.1149/1945-7111/ac1a85
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 4 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.35848/1882-0786/ac1b0d
+base_url | https://doi.org/10.35848/1882-0786/ac1b0d
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 5 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05ba
+base_url | https://doi.org/10.3847/1538-4357/ac05ba
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+
+The ingest worker was failing to re-check the terminal URL against the blocklist at the end of the attempt.
+
+Could retry all these to update status, but probably not worth it.
+
+## jov.arvojournals.org
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+-------------------------------------------------------------
+ 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021
+ 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561
+ 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057
+ 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793
+ 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441
+
+These seem to just not be published/available yet.
+
+But they also use watermark.silverchair.com.
+
+NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest
+NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix
+
+## kiss.kstudy.com
+
+Previously unable to download (2021-05_daily_improvements.md)
+
+## open.library.ubc.ca
+
+ link_source_id | base_url | terminal_url
+--------------------+------------------------------------+----------------------------------------------------------------------------------
+ 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664
+ 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189
+ 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+ 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994
+ 10.14288/1.0401312 | https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312
+
+Historical newspapers, out of scope?
+
+Video content:
+https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+
+Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+
+NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+
+
+## panor.ru
+
+ link_source_id | base_url | terminal_url
+-------------------------+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html
+ 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html
+ 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html
+ 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html
+ 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html
+
+"The full version of the article is available only to subscribers of the journal"
+
+Paywall
+
+## peerj.com
+
+Previously: this is HTML of reviews (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope
+
+## publons.com
+
+Previously: this is HTML (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope (length of works)
+
+## stm.bookpi.org
+
+ link_source_id | base_url | terminal_url
+-----------------------------+---------------------------------------------+----------------------------------------------------
+ 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231
+ 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096
+ 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330
+ 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810
+ 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274
+
+These are... just abstracts of articles within a book? Weird. Maybe sketchy? DOIs are via Crossref.
+
+## www.cabi.org
+
+ link_source_id | base_url | terminal_url
+--------------------------+------------------------------------------+----------------------------------------------------
+ 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742
+ 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471
+ 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544
+ 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117
+ 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337
+
+Reviews? but just abstracts?
+
+## www.cureus.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17547
+base_url | https://doi.org/10.7759/cureus.17547
+terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16867
+base_url | https://doi.org/10.7759/cureus.16867
+terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17425
+base_url | https://doi.org/10.7759/cureus.17425
+terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17313
+base_url | https://doi.org/10.7759/cureus.17313
+terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16943
+base_url | https://doi.org/10.7759/cureus.16943
+terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed
+
+Ugh, stupid "email to get PDF" prompt. But ingest seems to work anyways?
+
+NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar)
+
+## www.e-manuscripta.ch
+
+ link_source_id | base_url | terminal_url
+------------------------------+----------------------------------------------+-------------------------------------------------------------------
+ 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031
+ 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064
+ 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176
+ 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200
+ 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008
+
+Historical docs, single pages, but do have full PDF downloads.
+
+NOTE: re-ingest
+
+## www.inderscience.com
+
+Previously: paywall (2021-05_daily_improvements.md)
+
+## www.un-ilibrary.org
+
+ link_source_id | base_url | terminal_url
+----------------------------+--------------------------------------------+-------------------------------------------------------------
+ 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307
+ 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011
+ 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014
+ 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020
+ 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005
+
+Books and chapters. Doesn't seem to have actual download ability?
+
+# Re-Ingest / Re-Crawl
+
+Using fatcat-ingest helper tool.
+
+- www.isca-speech.org doi_prefix:10.21437
+ doi:* doi_prefix:10.21437 in_ia:false
+ 9,233
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json
+ => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221})
+- repository.dri.ie doi_prefix:10.7486
+ doi:* in_ia:false doi_prefix:10.7486
+ 56,532
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json
+ => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532})
+- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link)
+ 25,598
+ many are meeting abstracts
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json
+ => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598})
+- www.cureus.com doi_prefix:10.7759
+ 1,537
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json
+ => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535})
+- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta
+ 110,945
+ TODO: all are marked 'unpublished', but that is actually probably right?
+- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!)
+ doi:* in_ia:false doi_prefix:10.3389
+ 212,370
+ doi:10.3389/conf.* => most seem to be just abstracts? how many like this?
+ container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k)
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 191k
+ but many might be components? this is actually kind of a mess
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 19.2k
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json
+
+# Remaining Tasks / Domains (TODO)
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md
new file mode 100644
index 0000000..d36f427
--- /dev/null
+++ b/notes/ingest/2021-09-03_patch_crawl.md
@@ -0,0 +1,678 @@
+
+Going to run a combined crawl for `no-capture`, `no-pdf-link` and similar URL
+statuses.
+
+As a reminder, significant refactor of PDF URL extraction happened around
+Oct/Nov 2020, so things not re-ingested since then should be retried.
+
+1. first bulk re-process `no-pdf-link` statuses from past OAI-PMH and OA DOI crawls
+2. then heritrix crawl of old URLs from all sources (see status codes below)
+3. bulk ingest specific sources and statuses (see below)
+
+Status codes to crawl, potentially split into separate batches:
+
+ no-capture
+ IA errors
+ cdx-error
+ wayback-error
+ wayback-content-error
+ petabox-error
+ spn2-cdx-lookup-failure
+ gateway-timeout
+
+Then, bulk ingest from these sources matching the above patterns, in this order:
+
+- OA DOI (fatcat-ingest or fatcat-changelog source; will result in import)
+- unpaywall (will result in import)
+- OAI-PMH
+- MAG
+
+Current combined domain skip list (SQL filter syntax), for which we don't want
+to bother retrying:
+
+ '%journals.sagepub.com%'
+ '%pubs.acs.org%'
+ '%ahajournals.org%'
+ '%www.journal.csj.jp%'
+ '%aip.scitation.org%'
+ '%academic.oup.com%'
+ '%tandfonline.com%'
+ '%://orcid.org/%'
+ '%://doaj.org/%'
+ '%://archive.org/%'
+ '%://web.archive.org/%'
+ '%://www.archive.org/%'
+
+## DOI Ingest Status (2021-09-08)
+
+Recently did some analysis of OAI-PMH overall status, so can re-do comparisons
+there easily. What about overall DOI ingest? Would like counts so we can
+compare before/after.
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ no-pdf-link | 10516478
+ success | 5690862
+ redirect-loop | 1827192
+ no-capture | 1215179
+ terminal-bad-status | 650104
+ link-loop | 610251
+ blocked-cookie | 353681
+ gateway-timeout | 341319
+ too-many-redirects | 307895
+ forbidden | 306710
+ spn2-cdx-lookup-failure | 282955
+ not-found | 273667
+ cdx-error | 269082
+ skip-url-blocklist | 265689
+ spn2-error | 87759
+ wrong-mimetype | 68993
+ spn2-error:too-many-redirects | 58064
+ wayback-error | 54152
+ spn2-wayback-error | 51752
+ remote-server-error | 45683
+ (20 rows)
+
+## `no-pdf-link` re-try bulk ingest
+
+Specifically for past OAI-PMH and OA DOI crawls.
+
+What are top terminal domains that would be retried? So that we can filter out
+large ones we don't want to bother retrying.
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 40;
+
+ domain | count
+ ---------------------------------------+--------
+ ssl.fao.org | 862277
+ www.e-periodica.ch | 828110
+ zenodo.org | 686701
+ plutof.ut.ee | 685440
+ www.gbif.org | 669727
+ dlc.library.columbia.edu | 536018
+ figshare.com | 383181
+ juser.fz-juelich.de | 351519
+ statisticaldatasets.data-planet.com | 320415
+ espace.library.uq.edu.au | 310767
+ invenio.nusl.cz | 309731
+ doi.pangaea.de | 306311
+ igi.indrastra.com | 297872
+ bib-pubdb1.desy.de | 273565
+ t2r2.star.titech.ac.jp | 271907
+ digi.ub.uni-heidelberg.de | 265519
+ www.sciencedirect.com | 263847
+ publikationen.bibliothek.kit.edu | 229960
+ www.plate-archive.org | 209231
+ www.degruyter.com | 189776
+ spectradspace.lib.imperial.ac.uk:8443 | 187086
+ hal.archives-ouvertes.fr | 185513
+ open.library.ubc.ca | 172821
+ lup.lub.lu.se | 170063
+ books.openedition.org | 169501
+ orbi.uliege.be | 161443
+ freidok.uni-freiburg.de | 150310
+ library.wur.nl | 124318
+ digital.library.pitt.edu | 116406
+ www.research.manchester.ac.uk | 115869
+ www.bibliotecavirtualdeandalucia.es | 114527
+ repository.tue.nl | 112157
+ www.google.com | 111569
+ easy.dans.knaw.nl | 109608
+ springernature.figshare.com | 108597
+ nbn-resolving.org | 107544
+ scholarbank.nus.edu.sg | 107299
+ bibliotecavirtualdefensa.es | 105501
+ biblio.ugent.be | 100854
+ ruj.uj.edu.pl | 99500
+ (40 rows)
+
+For a number of these domains, we do not expect any PDFs to be found, but are
+going to re-ingest anyways so they get marked as 'blocked-*' in result table:
+
+- ssl.fao.org
+- plutof.ut.ee
+- www.gbif.org
+
+But some we are just going to skip anyways, because there *could* be PDFs, but
+probably *aren't*:
+
+- zenodo.org
+- t2r2.star.titech.ac.jp
+- www.google.com
+- figshare.com
+- springernature.figshare.com
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json';
+ => COPY 18040676
+
+Transform and start ingest:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json
+ => 18.0M 0:06:45 [44.5k/s]
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ => DONE
+
+## Progress Check
+
+OAI-PMH query:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 13258356
+ no-pdf-link | 8685519
+ no-capture | 4765663
+ redirect-loop | 1557731
+ terminal-bad-status | 803373
+ link-loop | 453999
+ wrong-mimetype | 440230
+ null-body | 71457
+ cdx-error | 18426
+ | 15275
+ petabox-error | 13408
+ wayback-error | 11845
+ blocked-cookie | 11580
+ skip-url-blocklist | 7761
+ wayback-content-error | 383
+ spn2-cdx-lookup-failure | 362
+ gateway-timeout | 320
+ body-too-large | 207
+ spn2-error:job-failed | 191
+ redirects-exceeded | 120
+ (20 rows)
+
+OAI-PMH compared to a couple weeks ago:
+
+ 13258356-12872279 = +386,077 success
+ 8685519-9329602 = -644,083 no-pdf-link
+ 4765663-4696362 = +69,301 no-capture
+ 803373-660418 = +142,955 terminal-bad-status
+
+OA DOI ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------+---------
+ no-pdf-link | 6693547
+ success | 5979016
+ skip-url-blocklist | 3080986
+ no-capture | 1876914
+ redirect-loop | 1872817
+ terminal-bad-status | 656674
+ link-loop | 624290
+ blocked-cookie | 448001
+ gateway-timeout | 351896
+ too-many-redirects | 307895
+ forbidden | 306710
+ spn2-cdx-lookup-failure | 301312
+ cdx-error | 279766
+ not-found | 273667
+ wrong-mimetype | 83289
+ spn2-error | 76806
+ spn2-error:too-many-redirects | 58064
+ wayback-error | 54278
+ spn2-wayback-error | 51768
+ remote-server-error | 45683
+ (20 rows)
+
+OA DOI changes:
+
+ 5979016-5690862 = +288,154 success
+ 6693547-10516478 = -3,822,931 no-pdf-link (still many!)
+ 1876914-1215179 = +661,735 no-capture
+ 3080986-265689 = +2,815,297 skip-url-blocklist
+
+Overall roughly 675k new 'success' results, which is pretty good, and over
+730k new no-capture rows to feed into crawling.
+
+## Seedlist Dumps
+
+Note that this is just seedlists, not full ingest requests.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ ) TO '/srv/sandcrawler/tasks/patch_2021-09-16_terminal_seedlist.txt';
+ => 6,354,365
+
+Then run the actual patch crawl!
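+
+One way the seedlist feeds the crawl (mirroring the `F+` schedule files used
+for the dataset crawls elsewhere in these notes): de-duplicate, shuffle, and
+write out heritrix `F+` directives. A minimal Python sketch of that transform;
+the output path is an assumption, and in practice a
+`sort -u | shuf | awk '{print "F+ " $1}'` one-liner does the same job:
+
+    import random
+
+    seedlist = "/srv/sandcrawler/tasks/patch_2021-09-16_terminal_seedlist.txt"
+    schedule = "/srv/sandcrawler/tasks/patch_2021-09-16_terminal_seedlist.schedule"
+
+    # de-duplicate URLs, then shuffle so a single host doesn't dominate the frontier
+    with open(seedlist) as f:
+        urls = list({line.strip() for line in f if line.strip()})
+    random.shuffle(urls)
+
+    # "F+" force-schedules each URL when the file lands in the heritrix action directory
+    with open(schedule, "w") as f:
+        for url in urls:
+            f.write(f"F+ {url}\n")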
+
+## Ingest Requests for Bulk Retry (2022-01-06)
+
+Crawl has just about completed, so running another round of bulk ingest
+requests, slightly updated to allow `https://doi.org/10.%` in the terminal URL:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.updated <= '2022-01-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json';
+ => 4,488,193
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => TIMEDOUT
+ => (probably due to re-assignment)
+ => DONE
+
+## Stats Again (just OAI-PMH)
+
+OAI-PMH query:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+On 2022-02-08:
+
+ status | count
+ -----------------------+----------
+ success | 13505143
+ no-pdf-link | 8741007
+ no-capture | 4429986
+ redirect-loop | 1566611
+ terminal-bad-status | 816162
+ link-loop | 459006
+ wrong-mimetype | 448983
+ null-body | 71871
+ cdx-error | 19055
+ | 15275
+ petabox-error | 11713
+ blocked-cookie | 11664
+ wayback-error | 8745
+ skip-url-blocklist | 7828
+ max-hops-exceeded | 2031
+ wayback-content-error | 338
+ body-too-large | 280
+ spn2-error:job-failed | 191
+ bad-redirect | 134
+ redirects-exceeded | 120
+ (20 rows)
+
+
+On 2022-02-28, after bulk ingest completed:
+
+ status | count
+ -----------------------+----------
+ success | 14668123
+ no-pdf-link | 8822460
+ no-capture | 2987565
+ redirect-loop | 1629015
+ terminal-bad-status | 917851
+ wrong-mimetype | 466512
+ link-loop | 460941
+ null-body | 71457
+ cdx-error | 19636
+ petabox-error | 16198
+ | 15275
+ blocked-cookie | 11885
+ wayback-error | 8779
+ skip-url-blocklist | 7838
+ empty-blob | 5906
+ max-hops-exceeded | 5563
+ wayback-content-error | 355
+ body-too-large | 329
+ spn2-error:job-failed | 191
+ bad-redirect | 137
+ (20 rows)
+
+
+Comparing to a couple months ago:
+
+ 14668123-13258356 = +1,409,767 success
+ 8822460-8685519 = + 136,941 no-pdf-link
+ 2987565-4765663 = -1,778,098 no-capture
+ 917851-803373 = + 114,478 terminal-bad-status
+
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
new file mode 100644
index 0000000..786c3b2
--- /dev/null
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -0,0 +1,504 @@
+
+First round of production dataset ingest. Aiming to get one or two small
+repositories entirely covered, and a few thousand datasets from all supported
+platforms.
+
+Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up
+to a TByte of content locally (on spinning disk). For successful output, will
+run through fatcat import; for a subset of unsuccessful, will start a small
+heritrix crawl.
+
+
+## Ingest Generation
+
+Summary:
+
+ wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json
+ 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+ 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+All the below ingest requests were combined into a single large file:
+
+ cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz
+ # 24.7k 0:00:00 [91.9k/s]
+
+### Figshare
+
+- sample 10k datasets (not other types)
+- want only "versioned" DOIs; use regex on DOI to ensure
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \
+ | rg '10\.6084/m9\.figshare\.\d+.v\d+' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000})
+
+### Zenodo
+
+- has DOIs (of course)
+- want only "versioned" DOIs? how to skip?
+- sample 10k
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \
+ | rg '10\.5281/zenodo' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+### Goettingen Research Online
+
+- <https://data.goettingen-research-online.de/>
+- Dataverse instance, not harvard-hosted
+- ~1,400 datasets, ~10,500 files
+- has DOIs
+- `doi_prefix:10.25625`, then filter to only one slash
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \
+ | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \
+ | shuf \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s]
+
+### Harvard Dataverse
+
+- main harvard dataverse instance, many "sub-dataverses"
+- ~137,000 datasets, ~1,400,000 files
+- 10k sample
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \
+ | rg '10\.7910/dvn/[a-z0-9]{6}' \
+ | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s]
+
+Note that this was fewer than expected, but moving on anyways.
+
+### archive.org
+
+A couple hand-filtered items.
+
+"CAT" dataset
+- item: <https://archive.org/details/CAT_DATASET>
+- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui`
+
+"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing"
+- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62
+- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper)
+
+
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/CAT_DATASET",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "36vy7s5gtba67fmyxlmijpsaui",
+ "work_ident": "ycqtbhnfmzamheq2amztiwbsri"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "36vy7s5gtba67fmyxlmijpsaui"
+ }
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu",
+ "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu"
+ }
+
+ # paste and then Ctrl-D:
+ cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+
+
+## Ingest Command
+
+On `wbgrp-svc263`.
+
+In the current version of the tool, `skip_cleanup_local_files=True` is the
+default, so downloaded files will stick around locally.
+
+Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output.
+
+
+ # first a small sample
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | head -n5 \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json
+
+ # ok, run the whole batch through
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json
+
+Got an error:
+
+ internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`?
+
+Did a hot patch to try to have the uploads happen under a session, with config from ENV, but didn't work:
+
+ AttributeError: 'ArchiveSession' object has no attribute 'upload'
+
+Going to hack with config in homedir for now.
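+
+For reference, a minimal sketch of how the session-based approach could work
+with the `internetarchive` library (uploads hang off `Item` objects rather than
+the session itself, which is what the `AttributeError` above points at). The
+environment variable names and local file path here are assumptions:
+
+    import os
+    import internetarchive
+
+    # build a session with S3-style keys from the environment instead of ~/.config/ia.ini
+    session = internetarchive.get_session(config={
+        "s3": {
+            "access": os.environ["IA_ACCESS_KEY"],
+            "secret": os.environ["IA_SECRET_KEY"],
+        },
+    })
+
+    # fetch the item and upload through it; identifier taken from the S3 URL in the error below
+    item = session.get_item("zenodo.org-3275525")
+    item.upload(
+        files={"rhOverM_Asymptotic_GeometricUnits_CoM.h5":
+               "/tmp/sandcrawler/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5"},
+    )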
+
+Extract URLs for crawling:
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+    | jq 'select(.status == "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+    | jq 'select(.status == "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt
+
+### Exceptions Encountered
+
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process
+ internetarchive.upload
+ [...]
+ ConnectionResetError: [Errno 104] Connection reset by peer
+ urllib3.exceptions.ProtocolError
+ requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5')
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process
+ r.raise_for_status()
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status
+ raise HTTPError(http_error_msg, response=self)
+ requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201
+
+Downloads sometimes just slowly time out, like after a day or more.
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path
+ mimetype = magic.Magic(mime=True).from_file(path)
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 111, in from_file
+ with _real_open(filename):
+ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz'
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request
+ obj_latest = obj["data"]["latestVersion"]
+ KeyError: 'latestVersion'
+
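+Presumably the fix just needs to tolerate Dataverse records that do not expose
+a `latestVersion`; a rough sketch of that kind of guard (the actual sandcrawler
+change may look different, and the exception type here is illustrative):
+
+    def get_latest_version(obj: dict) -> dict:
+        """obj is the parsed Dataverse dataset API response (see traceback above)."""
+        latest = obj.get("data", {}).get("latestVersion")
+        if latest is None:
+            # surface a clean, catchable error instead of a bare KeyError
+            raise ValueError("dataverse record has no latestVersion (unpublished or restricted?)")
+        return latest
+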
+Fixed the above exceptions (in sandcrawler), trying again:
+
+ git log | head -n1
+ # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c
+
+ Wed Dec 15 21:57:42 UTC 2021
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json
+
+Zenodo seems really slow, let's try filtering those out:
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json
+ # 3.76k 15:12:53 [68.7m/s]
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json
+
+## Fatcat Import
+
+ wc -l ingest_dataset_combined_results*.json
+ 126 ingest_dataset_combined_results2.json
+ 153 ingest_dataset_combined_results3.json
+ 275 ingest_dataset_combined_results4.json
+ 3762 ingest_dataset_combined_results5.json
+ 7736 ingest_dataset_combined_results6.json
+ 182 ingest_dataset_combined_results.json
+ 5 ingest_dataset_combined_results.ramp.json
+ 12239 total
+
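+    # dedupe by release_ident (the first 26 characters of each TSV row), keeping
+    # one ingest result per release; the `cut` then drops the ident column again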
+ cat ingest_dataset_combined_results*.json \
+ | rg '^\{' \
+ | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \
+ | sort \
+ | uniq --check-chars 26 \
+ | cut -f2 \
+ | rg -v '\\\\' \
+ | pv -l \
+ > uniq_ingest_dataset_combined_results.json
+ # 9.48k 0:00:06 [1.54k/s]
+
+ cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr
+ 7941 no-capture
+ 374 platform-404
+ 369 terminal-bad-status
+ 348 success-file
+ 172 success
+ 79 platform-scope
+ 77 error-platform-download
+ 47 empty-manifest
+ 27 platform-restricted
+ 20 too-many-files
+ 12 redirect-loop
+ 6 error-archiveorg-upload
+ 3 too-large-size
+ 3 mismatch
+ 1 no-platform-match
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success") | .' -c \
+ > uniq_ingest_dataset_combined_results.success.json
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success-file") | .' -c \
+ > uniq_ingest_dataset_combined_results.success-file.json
+
+On fatcat QA instance:
+
+ git log | head -n1
+ # commit cca680e2cc4768a4d45e199f6256a433b25b4075
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+Need to update fatcat file worker to support single-file filesets... was that the plan?
+
+ head /tmp/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+
+Trying again 2022-03-23:
+
+ git log | head -n1
+ # commit 134cb050988be2c545af89e0a67c4998307bb819
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0})
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0})
+
+Fixed a small logic error in insert path.
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0})
+
+archive.org datasets are *not* getting uploaded with the correct paths:
+directory prefixes are getting clobbered.
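+
+One possible angle on the fix: with the `internetarchive` library, passing
+`files` as a dict maps the remote name (key) to the local path (value), so the
+directory prefix on the archive.org side can be set explicitly. A sketch, with
+hypothetical identifier and paths:
+
+    import internetarchive
+
+    internetarchive.upload(
+        "figshare.com-7925396-v1",
+        files={
+            # remote name keeps the directory prefix, regardless of local layout
+            "data/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz":
+                "/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz",
+        },
+    )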
+
+## Summary
+
+As follow-up, it may be worth doing another manual round of ingest requests.
+After that, it would be good to fill in "glue" code so that this can be done
+with kafka workers, with retries and dumps driven from the sandcrawler SQL
+database. Then we can start scaling up ingest, using the ingest tool, "bulk
+mode" processing, and heritrix crawls from `no-capture` dumps, similar to the
+bulk file ingest process.
+
+For scaling, let's do a "full" ingest request generation of all datasets, and
+crawl the base URLs with heritrix in fast/direct mode. Expect this to be tens
+of millions of URLs, mostly DOIs (doi.org URLs), which should crawl quickly.
+
+Then do bulk downloading with the ingest worker, perhaps on misc-vm or aitio,
+uploading large datasets to archive.org but not doing SPN web requests. Feed
+the resulting huge file seedlist into a heritrix crawl to download web files.
+
+Will need to add support for more specific platforms.
+
+
+### Huge Bulk Ingest Prep
+
+On prod instance:
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz
+ # Expecting 11264787 release objects in search queries
+ # TIMEOUT ERROR
+ # 6.07M 19:13:02 [87.7 /s] (partial)
+
+As follow-up, should do a full batch (not partial). For now the search index
+is too unreliable (read timeouts).
+
+ zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \
+ | jq .base_url -r \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > ingest_dataset_bulk.2022-01-05.partial.schedule
+
+## Retries (2022-01-12)
+
+This is after having done a bunch of crawling.
+
+ cat ingest_dataset_combined_results6.json \
+ | rg '"no-capture"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | jq .request -c \
+ | pv -l \
+ > ingest_dataset_retry.json
+ => 6.51k 0:00:01 [3.55k/s]
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json
+
+## Retries (2022-02)
+
+Finally got things to complete end to end for this batch!
+
+ cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr
+ 3220 terminal-bad-status
+ 2120 no-capture
+ 380 empty-manifest
+ 264 success-file
+ 251 success
+ 126 success-existing
+ 39 mismatch
+ 28 error-platform-download
+ 24 too-many-files
+ 20 platform-scope
+ 13 platform-restricted
+ 13 mismatch-size
+ 6 too-large-size
+ 3 transfer-encoding-error
+ 2 no-platform-match
+ 2 error-archiveorg-upload
+ 1 redirect-loop
+ 1 empty-blob
+
+Some more URLs to crawl:
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt
+ # 1.00
+ # just a single DOI that failed to crawl, for whatever reason
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt
+
+These are ready to crawl, in the existing dataset crawl.
+
+ cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule
+
+## Running Uploads Again
+
+Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a
+big bummer! Will need to download many of these over again.
+
+ # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316
+ # skip_cleanup_local_files=True is still default
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json
+
+ # filter out zenodo, very slow:
+ # rg -v 10.5281 \
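+
+If zenodo items need to be skipped (DOI prefix `10.5281`; they are very slow to
+process), the commented-out `rg` filter above would slot into the pipeline like
+this (sketch; output filename is arbitrary):
+
+    zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+        | rg -v '10\.5281' \
+        | shuf \
+        | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+        | pv -l \
+        > /srv/sandcrawler/tasks/ingest_dataset_combined_results.nozenodo.json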
diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md
new file mode 100644
index 0000000..941519f
--- /dev/null
+++ b/notes/ingest/2022-01-06_patch_crawl.md
@@ -0,0 +1,398 @@
+
+Starting another paper fulltext patch crawl, targeting recent OA content which
+has failed to ingest, as well as specific platforms (arxiv, etc).
+
+Specifically:
+
+- "daily" changelog ingest requests from all time, which failed with various status codes
+- pdf no-capture
+- SPN errors
+- terminal-bad-status with 5xx, 429
+- gateway-timeout
+- html no-capture
+- html-resource-no-capture
+
+Most of these are dumped in a single complex query (below).
+
+TODO: html-resource-no-capture (from error message? or do SPN requests separately?)
+
+
+## Initial 'no-capture' Seedlist
+
+Dump terminal URLs (will do ingest requests later, using similar command):
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt';
+ => COPY 6389683
+
+TODO: filter out archive.org/www.archive.org
+
+ cat patch_terminal_url.2022-01-12.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-01-12.uniq.txt
+ => 5.73M 0:00:47 [ 120k/s]
+
+    # note: tweaked and re-ran the above after inspecting this output
+ cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 799045 doi.org
+ 317557 linkinghub.elsevier.com
+ 211091 arxiv.org
+ 204334 iopscience.iop.org
+ 139758 dialnet.unirioja.es
+ 130331 www.scielo.br
+ 124626 www.persee.fr
+ 85764 digitalrepository.unm.edu
+ 83913 www.mdpi.com
+ 79662 www.degruyter.com
+ 75703 www.e-periodica.ch
+ 72206 dx.doi.org
+ 69068 escholarship.org
+ 67848 idus.us.es
+ 57907 zenodo.org
+ 56624 ir.opt.ac.cn
+ 54983 projecteuclid.org
+ 52226 rep.bntu.by
+ 48376 osf.io
+ 48009 pubs.rsc.org
+ 46947 publikationen.ub.uni-frankfurt.de
+ 45564 www.research-collection.ethz.ch
+ 45153 dk.um.si
+ 43313 www.ssoar.info
+ 40543 scholarworks.umt.edu
+
+TODO: clean up the ingest request table in sandcrawler-db (rough sketch below):
+- remove filtered OAI-PMH prefixes
+- remove any invalid `base_url` (?)
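+
+A rough sketch of the prefix cleanup (not run; assumes the usual psql access to
+sandcrawler-db, would want the full prefix list from the seedlist queries
+above, plus a dry-run `SELECT COUNT(*)` with the same `WHERE` clause first):
+
+    psql -c "
+        DELETE FROM ingest_request
+        WHERE link_source = 'oai'
+          AND (link_source_id LIKE 'oai:kb.dk:%'
+            OR link_source_id LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+            OR link_source_id LIKE 'oai:hispana.mcu.es:%'
+            OR link_source_id LIKE 'oai:repec:%')
+    "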
+
+## More Seedlist (2022-02-08)
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt';
+ => COPY 444764
+
+ cat patch_terminal_url.2022-02-08.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-02-08.uniq.txt
+ => 426k 0:00:04 [ 103k/s]
+
+ cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 60123 www.degruyter.com
+ 59314 arxiv.org
+ 43674 zenodo.org
+ 17771 doi.org
+ 9501 linkinghub.elsevier.com
+ 9379 www.mdpi.com
+ 5691 opendata.uni-halle.de
+ 5578 scholarlypublishingcollective.org
+ 5451 era.library.ualberta.ca
+ 4982 www.cairn.info
+ 4306 www.taylorfrancis.com
+ 4189 papers.ssrn.com
+ 4157 apps.crossref.org
+ 4089 www.sciencedirect.com
+ 4033 mdpi-res.com
+ 3763 dlc.mpg.de
+ 3408 osf.io
+ 2603 www.frontiersin.org
+ 2594 watermark.silverchair.com
+ 2569 journals.lww.com
+ 1787 underline.io
+ 1680 archiviostorico.fondazione1563.it
+ 1658 www.jstage.jst.go.jp
+ 1611 cyberleninka.ru
+ 1535 www.schoeningh.de
+
+ cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule
+ => Done
+
+Copied to crawler svc206 and added to frontier.
+
+
+## Bulk Ingest Requests (2022-02-28)
+
+Note that we are skipping OAI-PMH here, because we just did a separate ingest
+for those.
+
+This is going to dump many duplicate lines (same `base_url`, multiple
+requests), but that is fine. Expecting something like 7 million rows.
+
+ COPY (
+ -- SELECT ingest_file_result.terminal_url
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated <= '2022-02-08'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ -- ingest_request.link_source = 'oai'
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json';
+ # COPY 3053219
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md
new file mode 100644
index 0000000..a6f08dd
--- /dev/null
+++ b/notes/ingest/2022-01-13_doi_crawl.md
@@ -0,0 +1,248 @@
+
+Could roll this in to current patch crawl instead of starting a new crawl from scratch.
+
+This file is misnamed; these are mostly non-DOI-specific small updates.
+
+## KBART "almost complete" experimentation
+
+Random 10 releases:
+
+ cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}'
+ https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone
+ https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed
+ https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works
+ https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern)
+ https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. force re-crawl worked for one copy
+ https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success
+ https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref
+ https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success
+ https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success
+ https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed
+
+Try some more!
+
+ https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success
+ https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success?
+ https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry
+ https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site
+ https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI
+ https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success
+ https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success
+ https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken
+ https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub)
+ https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success
+
+
+## Seeds: fixed OJS URLs
+
+Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like:
+
+- `no-pdf-link` with terminal URL like `/article/view/`
+- `redirect-loop` with terminal URL like `/article/view/`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json';
+ => COPY 326577
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json
+ cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Done/running.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'link-loop'
+ )
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt';
+ => COPY 342415
+
+ cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule
+
+Done/seeded.
+
+## Seeds: scitemed.com
+
+Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+        AND ingest_file_result.terminal_url LIKE '%scitemed.com/article%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json';
+ # SKIPPED
+
+Actually there are very few of these.
+
+## Seeds: non-OA paper DOIs
+
+There are many DOIs out there which are likely to be from small publishers, on
+the web, and would ingest just fine (eg, in OJS).
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count
+ 30,938,106
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count
+ 6,664,347
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count
+ 8,258,111
+
+Do the 8 million first, then maybe try the 30.9 million later? Do sampling to
+see how many are actually accessible? From experience with KBART generation,
+many of these are likely to crawl successfully.
+
+ ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz
+ # re-running 2022-02-08 after this VM was upgraded
+ # Expecting 8321448 release objects in search queries
+ # DONE
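+
+To get a rough sense of how many of these are actually accessible, one option
+is to run a small random sample through the ingest tool on a sandcrawler host
+(sketch; sample size is arbitrary, and `--no-spn2` is deliberately omitted so
+that SPN is used for URLs we have never crawled):
+
+    zcat ingest_nonoa_doi.json.gz \
+        | shuf -n 100 \
+        | ./ingest_tool.py requests - \
+        | jq .status -r \
+        | sort | uniq -c | sort -nr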
+
+This is large enough that it will probably be a bulk ingest, and then probably
+a follow-up crawl.
+
+## Seeds: HTML and XML links from HTML biblio
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \
+ | pv -l \
+ | rg '"(html|xml)_fulltext_url"' \
+ | rg '"no-pdf-link"' \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.json.gz
+
+    # this got cut off at some point, so the gzip stream is truncated (see below)
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l
+ # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file
+ # 2,538,433
+
+Prepare seedlists (to include in heritrix patch crawl):
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.xml_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz
+ # 1.24M 0:01:35 [12.9k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.html_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz
+ # 549k 0:01:27 [6.31k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | cut -f3 -d/ \
+ | sort -S 4G \
+ | uniq -c \
+ | sort -nr \
+ | head -n20
+
+ 534005 dlc.library.columbia.edu
+ 355319 www.degruyter.com
+ 196421 zenodo.org
+ 101450 serval.unil.ch
+ 100631 biblio.ugent.be
+ 47986 digi.ub.uni-heidelberg.de
+ 39187 www.emerald.com
+ 33195 www.cairn.info
+ 25703 boris.unibe.ch
+ 19516 journals.openedition.org
+ 15911 academic.oup.com
+ 11091 repository.dl.itc.u-tokyo.ac.jp
+ 9847 oxfordworldsclassics.com
+ 9698 www.thieme-connect.de
+ 9552 www.idunn.no
+ 9265 www.zora.uzh.ch
+ 8030 www.scielo.br
+ 6543 www.hanspub.org
+ 6229 asmedigitalcollection.asme.org
+ 5651 brill.com
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | awk '{print "F+ " $1}' \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+ wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+ 1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+Added to `JOURNALS-PATCH-CRAWL-2022-01`
+
+## Seeds: most doi.org terminal non-success
+
+Unless it is a 404, should retry.
+
+TODO: generate this list
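+
+A rough sketch of the dump query (not run; psql connection assumed via the
+usual environment, output path is a placeholder, and the usual publisher
+exclusions from the seedlist queries above would still be wanted):
+
+    psql -c "
+        COPY (
+            SELECT ingest_file_result.terminal_url
+            FROM ingest_request
+            LEFT JOIN ingest_file_result
+                ON ingest_file_result.ingest_type = ingest_request.ingest_type
+                AND ingest_file_result.base_url = ingest_request.base_url
+            WHERE
+                ingest_file_result.terminal_url LIKE '%://doi.org/10.%'
+                AND ingest_file_result.status <> 'success'
+                AND (ingest_file_result.terminal_status_code IS NULL
+                     OR ingest_file_result.terminal_status_code <> 404)
+        ) TO '/srv/sandcrawler/tasks/doi_terminal_nonsuccess.txt';
+    "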
+
+## Non-OA DOI Bulk Ingest
+
+Had previously run:
+
+ cat ingest_nonoa_doi.json.gz \
+ | rg -v "doi.org/10.2139/" \
+ | rg -v "doi.org/10.1021/" \
+ | rg -v "doi.org/10.1121/" \
+ | rg -v "doi.org/10.1515/" \
+ | rg -v "doi.org/10.1093/" \
+ | rg -v "europepmc.org" \
+ | pv -l \
+ | gzip \
+ > nonoa_doi.filtered.ingests.json.gz
+ # 7.35M 0:01:13 [99.8k/s]
+
+Starting a bulk ingest of these on 2022-03-18, which is *before* the crawl has
+entirely finished, but after almost all queues (domains) have been done for
+several days.
+
+ zcat nonoa_doi.filtered.ingests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Looks like many jstage `no-capture` status; these are still (slowly) crawling.
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md
new file mode 100644
index 0000000..9722459
--- /dev/null
+++ b/notes/ingest/2022-03_doaj.md
@@ -0,0 +1,278 @@
+
+plan:
+- usual setup and dump ingest requests
+- filter ingest requests to targeted ccTLDs, and add those to the crawl first
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz'
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz
+ # 9.08M 0:37:38 [4.02k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373})
+
+
+## Check Pre-Crawl Status
+
+2022-03-09, before the above load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 2919808
+ html | wrong-scope | 1098998
+ pdf | no-pdf-link | 481532
+ pdf | redirect-loop | 429006
+ html | success | 342501
+ html | unknown-scope | 225390
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187762
+ html | no-capture | 185418
+ pdf | no-capture | 171273
+ pdf | null-body | 129028
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91551
+ pdf | link-loop | 25447
+ html | wrong-mimetype | 22640
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ pdf | wrong-mimetype | 7688
+ xml | success | 6897
+ html | petabox-error | 5529
+ pdf | wayback-error | 2706
+ xml | null-body | 2353
+ pdf | | 2063
+ pdf | wayback-content-error | 1349
+ html | cdx-error | 1169
+ pdf | cdx-error | 1130
+ pdf | petabox-error | 679
+ html | | 620
+ pdf | empty-blob | 562
+ html | blocked-cookie | 545
+ (30 rows)
+
+After the above load:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3036457
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108132
+ pdf | no-pdf-link | 485703
+ pdf | redirect-loop | 436085
+ html | success | 342594
+ html | unknown-scope | 225412
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187999
+ html | no-capture | 187310
+ pdf | no-capture | 172033
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91799
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6897
+ html | petabox-error | 5530
+ pdf | wayback-error | 2707
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 771
+ pdf | empty-blob | 562
+ (30 rows)
+
+Dump ingest requests for crawling (or bulk ingest first?):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json';
+ => COPY 353819
+
+Not that many! Guess the filters are important?
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ );
+ => 3202164
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json
+ => 353k 0:00:16 [21.0k/s]
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Dump seeds again (for crawling):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json';
+ # COPY 350661
+
+And stats again:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3037059
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108476
+ pdf | no-pdf-link | 485705
+ pdf | redirect-loop | 436850
+ html | success | 342762
+ html | unknown-scope | 225412
+ html | redirect-loop | 224683
+ html | html-resource-no-capture | 188058
+ html | no-capture | 185734
+ pdf | no-capture | 170452
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91875
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19042
+ html | terminal-bad-status | 13333
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6898
+ html | petabox-error | 5535
+ pdf | wayback-error | 2711
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 772
+ html | blocked-cookie | 769
+ (30 rows)
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json
+
+Create seedlist:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | jq -r .base_url \
+ | sort -u -S 4G \
+ > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt
+
+Sent off and added to the `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl; will
+re-ingest when that completes (a week or two?).
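+
+For reference, the usual injection steps look like this (crawler hostname is a
+placeholder here; the 2022-04 targeted-crawl notes use the same pattern with
+real values):
+
+    cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt \
+        | awk '{print "F+ " $1}' \
+        > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.schedule
+
+    export CRAWLVM=wbgrp-svcNNN.us.archive.org   # placeholder
+    scp /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.schedule $CRAWLVM:/tmp
+    ssh $CRAWLVM sudo -u heritrix cp /tmp/doaj_seedlist_2022-03-10.schedule /0/ia-jobs/journal-crawls/TARGETED-ARTICLE-CRAWL-2022-03/action/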
+
+
+## Bulk Ingest
+
+After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up.
+
+ # 2022-03-22
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md
new file mode 100644
index 0000000..d2a8d71
--- /dev/null
+++ b/notes/ingest/2022-03_oaipmh.md
@@ -0,0 +1,40 @@
+
+Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl.
+
+Note that Martin excluded many Indonesian endpoints; will need to follow up on
+those.
+
+## Prep
+
+Fetch metadata snapshot:
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst
+
+Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large):
+
+ zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \
+ | rg -v 'oai:kb.dk:' \
+ | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \
+ | rg -v 'oai:hispana.mcu.es:' \
+ | rg -v 'oai:bnf.fr:' \
+ | rg -v 'oai:ukm.si:' \
+ | rg -v 'oai:biodiversitylibrary.org:' \
+ | rg -v 'oai:hsp.org:' \
+ | rg -v 'oai:repec:' \
+ | rg -v 'oai:n/a:' \
+ | rg -v 'oai:quod.lib.umich.edu:' \
+ | rg -v 'oai:americanae.aecid.es:' \
+ | rg -v 'oai:www.irgrid.ac.cn:' \
+ | rg -v 'oai:espace.library.uq.edu:' \
+ | rg -v 'oai:edoc.mpg.de:' \
+ | rg -v 'oai:bibliotecadigital.jcyl.es:' \
+ | rg -v 'oai:repository.erciyes.edu.tr:' \
+ | rg -v 'oai:krm.or.kr:' \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz
+
+These failed to transform in the expected way; a change in JSON schema from last time?
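+
+A quick way to narrow down what changed would be to eyeball the keys of a few
+records against what `oai2ingestrequest.py` expects (sketch):
+
+    zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \
+        | head -n5 \
+        | jq 'keys'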
diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md
new file mode 100644
index 0000000..23fd35f
--- /dev/null
+++ b/notes/ingest/2022-04_targeted.md
@@ -0,0 +1,144 @@
+
+Want to do a crawl similar to recent "patch" crawls, where we run heritrix
+crawls to "fill in" missing (`no-capture`) and failed daily ingests (aka,
+those requests coming from fatcat-changelog).
+
+ export PATCHDATE=2022-04-20
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json';
+ # COPY 4842749
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v www.archive.org \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ # 4.75M 0:01:44 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 1515829 www.jstage.jst.go.jp
+ 1052953 doi.org
+ 241704 arxiv.org
+ 219543 www.sciencedirect.com
+ 178562 www.persee.fr
+ 84947 zenodo.org
+ 67397 www.mdpi.com
+ 65775 journals.lww.com
+ 58216 opg.optica.org
+ 50673 osf.io
+ 45776 www.degruyter.com
+ 36664 www.indianjournals.com
+ 35287 pubs.rsc.org
+ 33495 www.bmj.com
+ 33320 www.research-collection.ethz.ch
+ 29728 www.e-periodica.ch
+ 28338 iopscience.iop.org
+ 26364 www.cambridge.org
+ 23840 onlinelibrary.wiley.com
+ 23641 platform.almanhal.com
+ 22660 brill.com
+ 20288 www.osapublishing.org
+ 18561 cgscholar.com
+ 18539 doi.nrct.go.th
+ 15677 www.frontiersin.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+TODO: starting with the "quarterly retry" script/query might make more sense?
+TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set?
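+
+A rough query to check the second TODO (not run; psql connection assumed via
+the usual environment):
+
+    psql -c "
+        SELECT ingest_file_result.status, COUNT(*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_file_result.status IS NOT NULL
+            AND ingest_file_result.status <> 'success'
+            AND ingest_file_result.terminal_url IS NULL
+        GROUP BY ingest_file_result.status
+        ORDER BY COUNT(*) DESC
+        LIMIT 20;
+    "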
+
+## Bulk Ingest Requests (post-crawl)
+
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json
+ => 4.84M 0:03:14 [24.9k/s]
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => started 2022-05-11
diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md
new file mode 100644
index 0000000..bc78998
--- /dev/null
+++ b/notes/ingest/2022-04_unpaywall.md
@@ -0,0 +1,278 @@
+
+New unpaywall snapshot from `2022-03-09`.
+
+This will probably be the last unpaywall crawl? Will switch to openalex in the
+future, because we can automate that ingest process, and run it on our own
+schedule.
+
+ export SNAPSHOT=2022-03-09
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=UNPAYWALL-CRAWL-2022-04
+
+## Download and Archive
+
+ wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz'
+ # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470]
+
+ export SNAPSHOT=2022-03-09
+ ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT
+
+ # if needed
+ scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv shell
+
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json
+ # 34.9M 3:02:32 [3.19k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ # 34.9M 5:23:15 [1.80k/s]
+ # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779})
+
+So about 6.1M new ingest request rows.
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- take "all time" instead of just this recent capture
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json';
+ => COPY 6025671
+
+ # transform
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json
+ # 6.03M 0:03:26 [29.1k/s]
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330232
+ success | 2455102
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16078
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+After prior "TARGETED" crawl and bulk ingest finished:
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330055
+ success | 2455279
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16079
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Almost no change, which makes sense because of the `ingest_request.created`
+filter.
+
+
+## Dump Seedlist
+
+Dump rows for crawling:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ -- AND date(ingest_request.created) > '2022-04-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json';
+ => before ingest and arxiv.org DOI exclusion: COPY 3309091
+ => COPY 3308914
+
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json
+ => 3.31M 0:02:22 [23.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT*
+ 15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt
+ 3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json
+ 3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt
+ 3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt
+
+Inject seedlist into crawler:
+
+ scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+Top domains?
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20
+ 158497 www.scielo.br
+ 144732 onlinelibrary.wiley.com
+ 129349 www.researchsquare.com
+ 94923 hal.archives-ouvertes.fr
+ 69293 openresearchlibrary.org
+ 64584 www.cell.com
+ 60033 link.springer.com
+ 50528 www.degruyter.com
+ 49737 projecteuclid.org
+ 45841 www.jstage.jst.go.jp
+ 44819 www.mdpi.com
+ 44325 ieeexplore.ieee.org
+ 38091 dr.lib.iastate.edu
+ 31030 www.nature.com
+ 30300 discovery.ucl.ac.uk
+ 27692 ntrs.nasa.gov
+ 24215 orca.cardiff.ac.uk
+ 23653 www.frontiersin.org
+ 23474 pure.rug.nl
+ 22660 www.sciencedirect.com
+
+
+## Post-Crawl bulk ingest
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # done: 2022-07-06
+
+## Post-Crawl, Post-Ingest Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 4784948 => +2,329,669 ~77%
+ redirect-loop | 485270 => + 288,153 ~10%
+ no-capture | 317598 => -3,012,457
+ terminal-bad-status | 267853 => + 185,235 ~ 6%
+ no-pdf-link | 118303 => + 85,257
+ blocked-cookie | 111373 => + 95,294
+ skip-url-blocklist | 19368
+ link-loop | 9091
+ wrong-mimetype | 7163
+ cdx-error | 2516
+ empty-blob | 1961
+ wayback-error | 1922
+ body-too-large | 509
+ petabox-error | 416
+ wayback-content-error | 341
+ bad-gzip-encoding | 281
+ | 253
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Groovy!
diff --git a/notes/ingest/2022-07-15_ingest_fixes.md b/notes/ingest/2022-07-15_ingest_fixes.md
new file mode 100644
index 0000000..ec31a7d
--- /dev/null
+++ b/notes/ingest/2022-07-15_ingest_fixes.md
@@ -0,0 +1,831 @@
+
+## HTML `html-resource-no-capture` Fixes
+
+Tracing down some `html-resource-no-capture` issues. Eg, `javascript:` resources causing errors.
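+
+A hedged sketch of the kind of filter that would avoid the `javascript:` case
+(not the actual sandcrawler code): skip non-fetchable URI schemes before doing
+sub-resource CDX lookups.
+
+    from urllib.parse import urlparse
+
+    def fetchable_subresources(urls):
+        """Yield only http(s) sub-resource URLs; javascript:, data:, mailto:,
+        etc. never have wayback captures, so looking them up just produces
+        spurious html-resource-no-capture errors."""
+        for url in urls:
+            if urlparse(url).scheme.lower() in ("http", "https"):
+                yield url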
+
+SQL query:
+
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' limit 100;
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' order by random() limit 100;
+
+ select count(*) from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture';
+ => 210,528
+
+http://agroengineering.it/index.php/jae/article/view/568/609
+- old capture, from `20171017204935`
+- missing .css file; seems like an actual case of missing content?
+- TODO: re-crawl/re-ingest when CDX is old
+
+https://www.karger.com/Article/FullText/484130
+- missing: https://www.karger.com/WebMaterial/ShowThumbnail/895999?imgType=2
+- resource is live
+- this was from DOI-LANDING crawl, no resources captured
+- TODO: re-crawl
+
+https://www.mdpi.com/1996-1073/13/21/5563/htm
+- missing: https://www.mdpi.com/1996-1073/13/21/5563/htm
+- common crawl capture; no/few resources?
+- TODO: re-crawl
+
+http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-736X2013000500011&lng=en&tlng=en
+- missing: http://www.scielo.br/img/revistas/pvb/v33n5/a11tab01.jpg
+ not on live web
+- old (2013) wide crawl
+- TODO: re-crawl
+
+http://g3journal.org/lookup/doi/10.1534/g3.116.027730
+- missing: http://www.g3journal.org/sites/default/files/highwire/ggg/6/8/2553/embed/mml-math-4.gif
+- old 2018 landing crawl (no resources)
+- TODO: re-crawl
+
+https://www.frontiersin.org/articles/10.3389/fimmu.2020.576134/full
+- "error_message": "revisit record missing URI and/or DT: warc:abc.net.au-news-20220328-130654/IA-FOC-abc.net.au-news-20220618135308-00003.warc.gz offset:768320762"
+- specific URL: https://www.frontiersin.org/areas/articles/js/app?v=uC9Es8wJ9fbTy8Rj4KipiyIXvhx7XEVhCTHvIrM4ShA1
+- archiveteam crawl
+- seems like a weird corner case. look at more 'frontiersin' articles, and re-crawl this page
+
+https://www.frontiersin.org/articles/10.3389/fonc.2020.01386/full
+- WORKING
+
+https://doi.org/10.4000/trajectoires.2317
+- redirect: https://journals.openedition.org/trajectoires/2317
+- missing: "https://journals.openedition.org/trajectoires/Ce fichier n'existe pas" (note spaces; the French path text means "this file does not exist")
+- FIXED
+
+http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S1413-81232002000200008&lng=en&tlng=en
+- WORKING
+
+https://f1000research.com/articles/9-571/v2
+- petabox-error on 'https://www.recaptcha.net/recaptcha/api.js'
+- added recaptcha.net to blocklist
+- still needs a re-crawl
+- SPN capture, from 2020, but images were missing?
+- re-capture has images (though JS still wonky)
+- TODO: re-crawl with SPN2
+
+http://bio.biologists.org/content/4/9/1163
+- DOI LANDING crawl, no sub-resources
+- TODO: recrawl
+
+http://err.ersjournals.com/content/26/145/170039.full
+- missing: http://err.ersjournals.com/sites/default/files/highwire/errev/26/145/170039/embed/graphic-5.gif
+ on live web
+- 2017 targeted heritrix crawl
+- TODO: recrawl
+
+http://www.dovepress.com/synthesis-characterization-and-antimicrobial-activity-of-an-ampicillin-peer-reviewed-article-IJN
+- missing: https://www.dovepress.com/cr_data/article_fulltext/s61000/61143/img/IJN-61143-F02-Thumb.jpg
+- recent archiveteam crawl
+- TODO: recrawl
+
+http://journals.ed.ac.uk/lithicstudies/article/view/1444
+- missing: http://journals.ed.ac.uk/lithicstudies/article/download/1444/2078/6081
+- common crawl
+- TODO: recrawl
+
+http://medisan.sld.cu/index.php/san/article/view/495
+- missing: http://ftp.scu.sld.cu/galen/medisan/logos/redib.jpg
+- this single resource is legit missing
+
+seems like it probably isn't a bad idea to just re-crawl all of these with fresh SPNv2 requests
+
+request sources:
+- fatcat-changelog (doi)
+- fatcat-ingest (doi)
+- doaj
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'html'
+ AND ingest_file_result.status = 'html-resource-no-capture'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json';
+ => COPY 210749
+
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json
+
+Try a sample of 300:
+
+ shuf -n300 /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Seeing a bunch of:
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fphys.2020.00454/full","https://www.frontiersin.org/articles/10.3389/fphys.2020.00454/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fmicb.2019.02507/full","https://www.frontiersin.org/articles/10.3389/fmicb.2019.02507/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.mdpi.com/2218-1989/10/9/366","https://www.mdpi.com/2218-1989/10/9/366/htm","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:964129887"]
+
+ "error_message": "revisit record missing URI and/or DT: warc:online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz offset:751923069",
+
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fnins.2020.00724/full","https://www.frontiersin.org/articles/10.3389/fnins.2020.00724/full","wayback payload sha1hex mismatch: 20220715222216 https://static.frontiersin.org/areas/articles/js/app?v=DfnFHSIgqDJBKQy2bbQ2S8vWyHe2dEMZ1Lg9o6vSS1g1"]
+
+These seem to be transfer encoding issues; fixed?
+
+ ["doaj","html-resource-no-capture","http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S0021-25712013000400003&lng=en&tlng=en","https://scielosp.org/article/aiss/2013.v49n4/336-339/en/","HTML sub-resource not found: https://ssm.scielo.org/media/assets/css/scielo-print.css"]
+
+Full batch:
+
+ # TODO: cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Not running the full batch for now, because almost all of these are `wayback-content-error` issues.
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | wc -l
+ 114935
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+
+## Redirect Loops
+
+Seems like there might have been a bug in how the ingest pipeline dealt with
+multiple redirects (eg, 301 to 302 or vice-versa), due to how CDX lookups and
+URL normalization were happening.
+
+This could be a really big deal, because we have over 11 million such ingest
+requests! We may even have stopped crawling some domains on the basis of
+(possibly spurious) redirect looping.
+
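+As a hedged illustration of how such a bug could arise (not the actual
+sandcrawler code): if loop detection keys on canonicalized URLs, an
+`http` -> `https` 301 followed by a 302 that differs only in scheme or a
+trailing slash collapses to the same key and gets mis-reported as a loop.
+
+    def canonical(url: str) -> str:
+        # CDX-style fuzziness: ignore scheme and trailing slash
+        return url.split("://", 1)[-1].rstrip("/").lower()
+
+    def follow(url, fetch, max_hops=8):
+        seen = set()
+        for _ in range(max_hops):
+            key = canonical(url)
+            if key in seen:
+                # false positive when hops differ only by scheme or slash
+                return ("redirect-loop", url)
+            seen.add(key)
+            status, location = fetch(url)
+            if status in (301, 302, 303, 307, 308) and location:
+                url = location
+            else:
+                return ("terminal", url)
+        return ("max-hops-exceeded", url)
+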
+ select * from ingest_file_result where ingest_type = 'pdf' and status = 'redirect-loop' limit 50;
+
+http://ieeexplore.ieee.org/iel7/7259950/7275573/07275755.pdf
+- 'skip-url-blocklist'
+- paywall on live web
+
+http://www.redjournal.org/article/S0360301616308276/pdf
+- redirect to 'secure.jbs.elsevierhealth.com'
+- ... but re-crawling with SPNv2 worked
+- TODO: reingest this entire journal with SPNv2
+
+http://www.jmirs.org/article/S1939865415001551/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+http://www.cell.com/article/S0006349510026147/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- TODO: try SPNv2?
+- RECRAWL: success
+
+http://infoscience.epfl.ch/record/256431/files/SPL_2018.pdf
+- FIXED: success
+
+http://www.nature.com/articles/hdy1994143.pdf
+- blocked-cookie (idp.nature.com / cookies_not_supported)
+- RECRAWL: gateway-timeout
+
+http://www.thelancet.com/article/S0140673619327606/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+https://pure.mpg.de/pubman/item/item_2065970_2/component/file_2065971/Haase_2014.pdf
+- FIXED: success
+
+http://hdl.handle.net/21.11116/0000-0001-B1A2-F
+- FIXED: success
+
+http://repositorio.ufba.br/ri/bitstream/ri/6072/1/%2858%29v21n6a03.pdf
+- FIXED: success
+
+http://www.jto.org/article/S1556086416329999/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+http://www.jahonline.org/article/S1054139X16303020/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+So, wow wow wow, a few things to do here:
+
+- just re-try all these redirect-loop attempts to update status
+- re-ingest all these elsevierhealth blocked crawls with SPNv2. this could take a long time!
+
+Possibly the elsevierhealth stuff will require some deeper fiddling to crawl
+correctly.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'redirect-loop'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json';
+ => COPY 6611342
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json
+
+Start with a sample:
+
+ shuf -n200 /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Wow that is a lot of ingest! And a healthy fraction of 'success', almost all
+via unpaywall (maybe should have done DOAJ/DOI only first). Let's do this full
+batch:
+
+ cat /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+TODO: repeat with broader query (eg, OAI-PMH, MAG, etc).
+
+## Other
+
+Revist resolution failed: \"Didn't get exact CDX url/datetime match. url:https://www.cairn.info/static/images//logo/logo-cairn-negatif.png dt:20220430145322 got:CdxRow(surt='info,cairn)/static/images/logo/logo-cairn-negatif.png', datetime='20220430145322', url='https://www.cairn.info/static/images/logo/logo-cairn-negatif.png', mimetype='image/png', status_code=200, sha1b32='Y3VQOPO2NFUR2EUWNXLYGYGNZPZLQYHU', sha1hex='c6eb073dda69691d12966dd78360cdcbf2b860f4', warc_csize=10875, warc_offset=2315284914, warc_path='archiveteam_archivebot_go_20220430212134_59230631/old.worldurbancampaign.org-inf-20220430-140628-acnq5-00000.warc.gz')\""
+
+ https://www.cairn.info/static/images//logo/logo-cairn-negatif.png 20220430145322
+ https://www.cairn.info/static/images/logo/logo-cairn-negatif.png 20220430145322
+
+Fixed!
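+
+The fix presumably amounts to tolerating duplicate slashes when matching the
+requested URL against the CDX row; a minimal sketch of that idea (assumed, not
+the actual code):
+
+    import re
+    from urllib.parse import urlsplit, urlunsplit
+
+    def collapse_slashes(url: str) -> str:
+        p = urlsplit(url)
+        path = re.sub(r"/{2,}", "/", p.path)
+        return urlunsplit((p.scheme, p.netloc, path, p.query, p.fragment))
+
+    requested = "https://www.cairn.info/static/images//logo/logo-cairn-negatif.png"
+    cdx_row = "https://www.cairn.info/static/images/logo/logo-cairn-negatif.png"
+    assert collapse_slashes(requested) == collapse_slashes(cdx_row)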
+
+
+## Broken WARC Record?
+
+cdx line:
+
+ net,cloudfront,d1bxh8uas1mnw7)/assets/embed.js 20220716084026 https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js warc/revisit - U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB - - 660 751923069 online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz
+
+download WARC and run:
+
+ zcat IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz | rg d1bxh8uas1mnw7.cloudfront.net/assets/embed.js -a -C 20
+
+the WARC record:
+
+ WARC/1.0
+ WARC-Type: revisit
+ WARC-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js
+ WARC-Date: 2022-07-16T08:40:26Z
+ WARC-Payload-Digest: sha1:U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB
+ WARC-IP-Address: 13.227.21.220
+ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+ WARC-Truncated: length
+ WARC-Record-ID: <urn:uuid:cc79139e-d43f-4b43-9b9e-f923610344d0>
+ Content-Type: application/http; msgtype=response
+ Content-Length: 493
+
+ HTTP/1.1 200 OK
+ Content-Type: application/javascript
+ Content-Length: 512
+ Connection: close
+ Last-Modified: Fri, 22 Apr 2022 08:45:38 GMT
+ Accept-Ranges: bytes
+ Server: AmazonS3
+ Date: Fri, 15 Jul 2022 16:36:08 GMT
+ ETag: "1c28db48d4012f0221b63224a3bb7137"
+ Vary: Accept-Encoding
+ X-Cache: Hit from cloudfront
+ Via: 1.1 5b475307685b5cecdd0df414286f5438.cloudfront.net (CloudFront)
+ X-Amz-Cf-Pop: SFO20-C1
+ X-Amz-Cf-Id: SIRR_1LT8mkp3QVaiGYttPuomxyDfJ-vB6dh0Slg_qqyW0_WwnA1eg==
+ Age: 57859
+
+where are the `WARC-Refers-To-Target-URI` and `WARC-Refers-To-Date` lines?
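+
+For spot-checking, something like this (using the `warcio` library, against the
+WARC file downloaded above) lists revisit records that are missing those
+headers:
+
+    from warcio.archiveiterator import ArchiveIterator
+
+    with open("IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz", "rb") as f:
+        for record in ArchiveIterator(f):
+            if record.rec_type != "revisit":
+                continue
+            refers_uri = record.rec_headers.get_header("WARC-Refers-To-Target-URI")
+            refers_dt = record.rec_headers.get_header("WARC-Refers-To-Date")
+            if not (refers_uri and refers_dt):
+                print("incomplete revisit:", record.rec_headers.get_header("WARC-Target-URI"))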
+
+## osf.io
+
+ select status, terminal_status_code, count(*) from ingest_file_result where base_url LIKE 'https://doi.org/10.17605/osf.io/%' and ingest_type = 'pdf' group by status, terminal_status_code order by count(*) desc limit 30;
+
+ status | terminal_status_code | count
+ -------------------------+----------------------+-------
+ terminal-bad-status | 404 | 92110
+ no-pdf-link | 200 | 46932
+ not-found | 200 | 20212
+ no-capture | | 8599
+ success | 200 | 7604
+ redirect-loop | 301 | 2125
+ terminal-bad-status | 503 | 1657
+ cdx-error | | 1301
+ wrong-mimetype | 200 | 901
+ terminal-bad-status | 410 | 364
+ read-timeout | | 167
+ wayback-error | | 142
+ gateway-timeout | | 139
+ terminal-bad-status | 500 | 76
+ spn2-error | | 63
+ spn2-backoff | | 42
+ petabox-error | | 39
+ spn2-backoff | 200 | 27
+ redirect-loop | 302 | 19
+ terminal-bad-status | 400 | 15
+ terminal-bad-status | 401 | 15
+ remote-server-error | | 14
+ timeout | | 11
+ terminal-bad-status | | 11
+ petabox-error | 200 | 10
+ empty-blob | 200 | 8
+ null-body | 200 | 6
+ spn2-error:unknown | | 5
+ redirect-loop | 308 | 4
+ spn2-cdx-lookup-failure | | 4
+ (30 rows)
+
+Many of these are now non-existent, or are datasets/registrations rather than
+articles. Hrm.
+
+
+## Large DOAJ no-pdf-link Domains
+
+ SELECT
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain,
+ COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_request.base_url = ingest_file_result.base_url
+ WHERE
+ ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source = 'doaj'
+ GROUP BY
+ domain
+ ORDER BY
+ COUNT(*) DESC
+ LIMIT 50;
+
+ domain | count
+ -------------------------------------------------------+--------
+ www.sciencedirect.com | 211090
+ auth.openedition.org | 20741
+ journal.frontiersin.org:80 | 11368
+ journal.frontiersin.org | 6494
+ ejde.math.txstate.edu | 4301
+ www.arkat-usa.org | 4001
+ www.scielo.br | 3736
+ www.lcgdbzz.org | 2892
+ revistas.uniandes.edu.co | 2715
+ scielo.sld.cu | 2612
+ www.egms.de | 2488
+ journals.lww.com | 2415
+ ter-arkhiv.ru | 2239
+ www.kitlv-journals.nl | 2076
+ www.degruyter.com | 2061
+ jwcn-eurasipjournals.springeropen.com | 1929
+ www.cjcnn.org | 1908
+ www.aimspress.com | 1885
+ vsp.spr-journal.ru | 1873
+ dx.doi.org | 1648
+ www.dlib.si | 1582
+ aprendeenlinea.udea.edu.co | 1548
+ www.math.u-szeged.hu | 1448
+ dergipark.org.tr | 1444
+ revistas.uexternado.edu.co | 1429
+ learning-analytics.info | 1419
+ drive.google.com | 1399
+ www.scielo.cl | 1326
+ www.economics-ejournal.org | 1267
+ www.jssm.org | 1240
+ html.rhhz.net | 1232
+ journalofinequalitiesandapplications.springeropen.com | 1214
+ revistamedicina.net | 1197
+ filclass.ru | 1154
+ ceramicayvidrio.revistas.csic.es | 1152
+ gynecology.orscience.ru | 1126
+ www.tobaccoinduceddiseases.org | 1090
+ www.tandfonline.com | 1046
+ www.querelles-net.de | 1038
+ www.swjpcc.com | 1032
+ microbiologyjournal.org | 1028
+ revistas.usal.es | 1027
+ www.medwave.cl | 1023
+ ijtech.eng.ui.ac.id | 1023
+ www.scielo.sa.cr | 1021
+ vestnik.szd.si | 986
+ www.biomedcentral.com:80 | 984
+ scielo.isciii.es | 983
+ bid.ub.edu | 970
+ www.meirongtv.com | 959
+ (50 rows)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ejde.math.txstate.edu%' limit 5;
+ http://ejde.math.txstate.edu/Volumes/2018/30/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2012/137/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2016/268/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2015/194/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2014/43/abstr.html
+ # plain HTML, not really parse-able
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.arkat-usa.org%' limit 5;
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0013.909
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0007.717
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.p008.158
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0014.216
+ # fixed (embed PDF)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.scielo.br%' limit 5;
+ https://doi.org/10.5935/0034-7280.20200075
+ https://doi.org/10.5935/0004-2749.20200071
+ https://doi.org/10.5935/0034-7280.20200035
+ http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-44461999000400014
+ https://doi.org/10.5935/0034-7280.20200047
+ # need recrawls?
+ # then success
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.lcgdbzz.org%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://revistas.uniandes.edu.co%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://scielo.sld.cu%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.egms.de%' limit 5;
+ https://doi.org/10.3205/16dgnc020
+ http://nbn-resolving.de/urn:nbn:de:0183-19degam1126
+ http://www.egms.de/en/meetings/dgpraec2019/19dgpraec032.shtml
+ http://www.egms.de/en/meetings/dkou2019/19dkou070.shtml
+ http://nbn-resolving.de/urn:nbn:de:0183-20nrwgu625
+ # mostly abstracts, don't have PDF versions
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ter-arkhiv.ru%' limit 5;
+ https://doi.org/10.26442/terarkh201890114-47
+ https://doi.org/10.26442/00403660.2019.12.000206
+ https://journals.eco-vector.com/0040-3660/article/download/32246/pdf
+ https://journals.eco-vector.com/0040-3660/article/download/33578/pdf
+ https://doi.org/10.26442/00403660.2019.12.000163
+ # working, needed recrawls (some force re-crawls)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.kitlv-journals.nl%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.cjcnn.org%' limit 5;
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.dlib.si%' limit 5;
+ https://srl.si/ojs/srl/article/view/2910
+ https://srl.si/ojs/srl/article/view/3640
+ https://srl.si/ojs/srl/article/view/2746
+ https://srl.si/ojs/srl/article/view/2557
+ https://srl.si/ojs/srl/article/view/2583
+ # fixed? (dlib.si)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.jssm.org%' limit 5;
+ http://www.jssm.org/vol4/n4/8/v4n4-8text.php
+ http://www.jssm.org/vol7/n1/19/v7n1-19text.php
+ http://www.jssm.org/vol9/n3/10/v9n3-10text.php
+ http://www.jssm.org/abstresearcha.php?id=jssm-14-347.xml
+ http://www.jssm.org/vol7/n2/11/v7n2-11text.php
+ # works as an HTML document? otherwise hard to select on PDF link
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://filclass.ru%' limit 5;
+ https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism
+ https://filclass.ru/en/archive/2015/42/training-as-an-effective-form-of-preparation-for-the-final-essay
+ https://filclass.ru/en/archive/2020/vol-25-3/didaktizatsiya-literatury-rossijskikh-nemtsev-zanyatie-po-poeme-viktora-klyajna-jungengesprach
+ https://filclass.ru/en/archive/2015/40/the-communicative-behaviour-of-the-russian-intelligentsia-and-its-reflection-in-reviews-as-a-genre-published-in-online-literary-journals-abroad
+ https://filclass.ru/en/archive/2016/46/discoursive-means-of-implication-of-instructive-components-within-the-anti-utopia-genre
+ # fixed
+ # TODO: XXX: re-crawl/ingest
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://microbiologyjournal.org%' limit 5;
+ https://microbiologyjournal.org/the-relationship-between-the-type-of-infection-and-antibiotic-resistance/
+ https://microbiologyjournal.org/antimicrobial-resistant-shiga-toxin-producing-escherichia-coli-isolated-from-ready-to-eat-meat-products-and-fermented-milk-sold-in-the-formal-and-informal-sectors-in-harare-zimbabwe/
+ https://microbiologyjournal.org/emerging-antibiotic-resistance-in-mycoplasma-microorganisms-designing-effective-and-novel-drugs-therapeutic-targets-current-knowledge-and-futuristic-prospects/
+ https://microbiologyjournal.org/microbiological-and-physicochemicalpropertiesofraw-milkproduced-from-milking-to-delivery-to-milk-plant/
+ https://microbiologyjournal.org/association-of-insulin-based-insulin-resistance-with-liver-biomarkers-in-type-2-diabetes-mellitus/
+ # HTML article, no PDF
+ # ... but only sometimes
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.medwave.cl%' limit 5;
+ http://www.medwave.cl/link.cgi/Medwave/Perspectivas/Cartas/6878
+ https://www.medwave.cl/link.cgi/Medwave/Revisiones/RevisionClinica/8037.act
+ http://dx.doi.org/10.5867/medwave.2012.03.5332
+ https://www.medwave.cl/link.cgi/Medwave/Estudios/Casos/7683.act
+ http://www.medwave.cl/link.cgi/Medwave/Revisiones/CAT/5964
+ # HTML article, no PDF
+
+Re-ingest HTML:
+
+ https://fatcat.wiki/container/mafob4ewkzczviwipyul7knndu (DONE)
+ https://fatcat.wiki/container/6rgnsrp3rnexdoks3bxcmbleda (DONE)
+
+Re-ingest PDF:
+
+ doi_prefix:10.5935 (DONE)
+ doi_prefix:10.26442
+
+## More Scielo
+
+More scielo? `doi_prefix:10.5935 in_ia:false`
+
+ http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873
+ # OJS? fixed
+
+ https://revistas.unicentro.br/index.php/repaa/article/view/2667/2240
+ # working, but needed re-crawl
+
+ http://www.rbcp.org.br/details/2804/piezoelectric-preservative-rhinoplasty--an-alternative-approach-for-treating-bifid-nose-in-tessier-no--0-facial-cleft
+
+A few others, mostly now working
+
+## Recent OA DOIs
+
+ fatcat-cli search release 'is_oa:true (type:article-journal OR type:article OR type:paper-conference) !doi_prefix:10.5281 !doi_prefix:10.6084 !doi_prefix:10.48550 !doi_prefix:10.25446 !doi_prefix:10.25384 doi:* date:>2022-06-15 date:<2022-07-15 in_ia:false !publisher_type:big5' --index-json --limit 0 | pv -l > recent_missing_oa.json
+
+ wc -l recent_missing_oa.json
+ 24433
+
+ cat recent_missing_oa.json | jq .doi_prefix -r | sort | uniq -c | sort -nr | head
+ 4968 10.3390
+ 1261 10.1080
+ 687 10.23668
+ 663 10.1021
+ 472 10.1088
+ 468 10.4000
+ 367 10.3917
+ 357 10.1364
+ 308 10.4230
+ 303 10.17863
+
+ cat recent_missing_oa.json | jq .doi_registrar -r | sort | uniq -c | sort -nr
+ 19496 crossref
+ 4836 datacite
+ 101 null
+
+ cat recent_missing_oa.json | jq .publisher_type -r | sort | uniq -c | sort -nr
+ 9575 longtail
+ 8419 null
+ 3861 society
+ 822 unipress
+ 449 oa
+ 448 scielo
+ 430 commercial
+ 400 repository
+ 22 other
+ 7 archive
+
+ cat recent_missing_oa.json | jq .publisher -r | sort | uniq -c | sort -nr | head
+ 4871 MDPI AG
+ 1107 Informa UK (Taylor & Francis)
+ 665 EAG-Publikationen
+ 631 American Chemical Society
+ 451 IOP Publishing
+ 357 The Optical Society
+ 347 OpenEdition
+ 309 CAIRN
+ 308 Schloss Dagstuhl - Leibniz-Zentrum für Informatik
+ 303 Apollo - University of Cambridge Repository
+
+ cat recent_missing_oa.json | jq .container_name -r | sort | uniq -c | sort -nr | head
+ 4908 null
+ 378 Sustainability
+ 327 ACS Omega
+ 289 Optics Express
+ 271 International Journal of Environmental Research and Public Health
+ 270 International Journal of Health Sciences
+ 238 Sensors
+ 223 International Journal of Molecular Sciences
+ 207 Molecules
+ 193 Proceedings of the National Academy of Sciences of the United States of America
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | wc -l
+ 16558
+
+ cat recent_missing_oa.json | rg -i mdpi | shuf -n10 | jq .doi -r
+ 10.3390/molecules27144419
+ => was a 404
+ => recrawl was successful
+ 10.3390/math10142398
+ => was a 404
+ 10.3390/smartcities5030039
+ => was a 404
+
+Huh, we need to re-try/re-crawl MDPI URLs every week or so? Or special-case this situation.
+Could be just a fatcat script, or a sandcrawler query.
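+
+If it ends up as a sandcrawler query, it would probably look something like the
+sketch below (hypothetical periodic job; table/column names match the queries
+used elsewhere in these notes), re-emitting week-old 404s as ingest requests:
+
+    import json
+    import psycopg2
+
+    conn = psycopg2.connect("dbname=sandcrawler")
+    cur = conn.cursor()
+    cur.execute("""
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE ingest_request.link_source = 'doi'
+          AND ingest_file_result.status = 'terminal-bad-status'
+          AND ingest_file_result.terminal_status_code = 404
+          AND ingest_file_result.updated < now() - interval '7 days'
+    """)
+    # print one JSON ingest request per line, to be piped into Kafka like the
+    # other retry batches in these notes
+    for (row,) in cur:
+        print(json.dumps(row))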
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | shuf -n10 | jq .doi -r
+
+ https://doi.org/10.18452/24860
+ => success (just needed quarterly retry?)
+ => b8c6c86aebd6cd2d85515441bbce052bcff033f2 (not in fatcat.wiki)
+ => current status is "bad-redirect"
+ https://doi.org/10.26181/20099540.v1
+ => success
+ => 3f9b1ff2a09f3ea9051dbbef277579e8a0b4df30
+ => this is figshare, and versioned. PDF was already attached to another DOI: https://doi.org/10.26181/20099540
+ https://doi.org/10.4230/lipics.sea.2022.22
+ => there is a bug resulting in trailing slash in `citation_pdf_url`
+ => fixed as a quirks mode
+ => emailed to report
+ https://doi.org/10.3897/aca.5.e89679
+ => success
+ => e6fd1e066c8a323dc56246631748202d5fb48808
+ => current status is 'bad-redirect'
+ https://doi.org/10.1103/physrevd.105.115035
+ => was 404
+ => success after force-recrawl of the terminal URL (not base URL)
+ https://doi.org/10.1155/2022/4649660
+ => was 404
+ => success after force-recrawl (of base_url)
+ https://doi.org/10.1090/spmj/1719
+ => paywall (not actually OA)
+ => https://fatcat.wiki/container/x6jfhegb3fbv3bcbqn2i3espiu is on Szczepanski list, but isn't all OA?
+ https://doi.org/10.1139/as-2022-0011
+ => was no-pdf-link
+ => fixed fulltext URL extraction
+ => still needed to re-crawl terminal PDF link? hrm
+ https://doi.org/10.31703/grr.2022(vii-ii).02
+ => was no-pdf-link
+ => fixed! success
+ https://doi.org/10.1128/spectrum.00154-22
+ => was 404
+ => now repeatably 503, via SPN
+ https://doi.org/10.51601/ijersc.v3i3.393
+ => 503 server error
+ https://doi.org/10.25416/ntr.20137379.v1
+ => is figshare
+ => docx (not PDF)
+ https://doi.org/10.25394/pgs.20263698.v1
+ => figshare
+ => embargo'd
+ https://doi.org/10.24850/j-tyca-14-4-7
+ => was no-pdf-link
+ => docs.google.com/viewer (!)
+ => now handle this (success)
+ https://doi.org/10.26267/unipi_dione/1832
+ => was bad-redirect
+ => success
+ https://doi.org/10.25560/98019
+ => body-too-large
+ => also, PDF metadata fails to parse
+ => is actually like 388 MByte
+ https://doi.org/10.14738/abr.106.12511
+ => max-hops-exceeded
+ => bumped max-hops from 6 to 8
+ => then success (via google drive)
+ https://doi.org/10.24350/cirm.v.19933803
+ => video, not PDF
+ https://doi.org/10.2140/pjm.2022.317.67
+ => link-loop
+ => not actually OA
+ https://doi.org/10.26265/polynoe-2306
+ => was bad-redirect
+ => now success
+ https://doi.org/10.3389/fpls.2022.826875
+ => frontiers
+ => was terminal-bad-status (403)
+ => success on retry (not sure why)
+ => maybe this is also a date-of-publication thing?
+ => not sure all these should be retried though
+ https://doi.org/10.14198/medcom.22240
+ => was terminal-bad-status (404)
+ => force-recrawl resulted in an actual landing page, but still no-pdf-link
+ => but actual PDF is a real 404, it seems. oh well
+ https://doi.org/10.31729/jnma.7579
+ => no-capture
+ https://doi.org/10.25373/ctsnet.20146931.v2
+ => figshare
+ => video, not document or PDF
+ https://doi.org/10.1007/s42600-022-00224-0
+ => not yet crawled/attempted (!)
+ => springer
+ => not actually OA
+ https://doi.org/10.37391/ijeer.100207
+ => some upstream issue (server not found)
+ https://doi.org/10.1063/5.0093946
+ => aip.scitation.org, is actually OA (can download in browser)
+ => cookie trap?
+ => redirect-loop (seems like a true redirect loop)
+ => retrying the terminal PDF URL seems to have worked
+ https://doi.org/10.18502/jchr.v11i2.9998
+ => no actual fulltext on publisher site
+ https://doi.org/10.1128/spectrum.01144-22
+ => this is a 503 error, even after retrying. weird!
+
+DONE: check `publisher_type` in chocula for:
+- "MDPI AG"
+- "Informa UK (Taylor & Francis)"
+
+ cat recent_missing_oa.json | jq '[.publisher, .publisher_type]' -c | sort | uniq -c | sort -nr | head -n40
+ 4819 ["MDPI AG","longtail"]
+ 924 ["Informa UK (Taylor & Francis)",null]
+ 665 ["EAG-Publikationen",null]
+ 631 ["American Chemical Society","society"]
+ 449 ["IOP Publishing","society"]
+ 357 ["The Optical Society","society"]
+ 336 ["OpenEdition","oa"]
+ 309 ["CAIRN","repository"]
+ 308 ["Schloss Dagstuhl - Leibniz-Zentrum für Informatik",null]
+ 303 ["Apollo - University of Cambridge Repository",null]
+ 292 ["Springer (Biomed Central Ltd.)",null]
+ 275 ["Purdue University Graduate School",null]
+ 270 ["Suryasa and Sons","longtail"]
+ 257 ["La Trobe",null]
+ 216 ["Frontiers Media SA","longtail"]
+ 193 ["Proceedings of the National Academy of Sciences","society"]
+ 182 ["Informa UK (Taylor & Francis)","longtail"]
+ 176 ["American Physical Society","society"]
+ 168 ["Institution of Electrical Engineers","society"]
+ 166 ["Oxford University Press","unipress"]
+ 153 ["Loughborough University",null]
+
+    chocula mostly seems to set these correctly. Is the issue that the
+    chocula-computed values aren't coming through or getting updated? Probably
+    a combination: the release (from container) metadata update, the chocula
+    importer not doing updates based on this field, and some old/incorrect
+    values.
+
+    Did some cleanups of specific containers; the next chocula update should
+    result in a bunch more `publisher_type` getting populated on older
+    containers.
+
+
+TODO: verify URLs are actually URLs... somewhere? in the ingest pipeline
+
+TODO: fatcat: don't ingest figshare "work" DOIs, only the "versioned" ones (?)
+ doi_prefix:10.26181
+
+WIP: sandcrawler: regularly (weekly?) re-try 404 errors (the terminal URL, not the base url?) (or, some kind of delay?)
+ doi_prefix:10.3390 (MDPI)
+ doi_prefix:10.1103
+ doi_prefix:10.1155
+
+DONE: simply re-ingest all:
+ doi_prefix:10.4230
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.4230'
+ # Counter({'ingest_request': 2096, 'elasticsearch_release': 2096, 'estimate': 2096, 'kafka': 2096})
+ container_65lzi3vohrat5nnymk3dqpoycy
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 65lzi3vohrat5nnymk3dqpoycy
+ # Counter({'ingest_request': 187, 'elasticsearch_release': 187, 'estimate': 187, 'kafka': 187})
+ container_5vp2bio65jdc3blx6rfhp3chde
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 5vp2bio65jdc3blx6rfhp3chde
+ # Counter({'ingest_request': 83, 'elasticsearch_release': 83, 'estimate': 83, 'kafka': 83})
+
+DONE: verify and maybe re-ingest all:
+ is_oa:true publisher:"Canadian Science Publishing" in_ia:false
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --allow-non-oa --ingest-type pdf --force-recrawl query 'year:>2010 is_oa:true publisher:"Canadian Science Publishing" in_ia:false !journal:print'
+ # Counter({'ingest_request': 1041, 'elasticsearch_release': 1041, 'estimate': 1041, 'kafka': 1041})
+
+
+## Re-Ingest bad-redirect, max-hops-exceeded, and google drive
+
+Similar to `redirect-loop`:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'bad-redirect'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json';
+ # COPY 100011
+ # after first run: COPY 5611
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'max-hops-exceeded'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json';
+ # COPY 3546
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.hit is false
+ AND ingest_file_result.terminal_url like 'https://docs.google.com/viewer%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json';
+ # COPY 1082
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json
+
+ cat /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ # DONE
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md
new file mode 100644
index 0000000..74aeb8d
--- /dev/null
+++ b/notes/ingest/2022-07-19_dblp.md
@@ -0,0 +1,50 @@
+
+Cross-posting from fatcat bulk metadata update/ingest.
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 631k 0:00:11 [54.0k/s]
+
+
+## Post-Crawl Stats
+
+This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run
+2022-09-06:
+
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'dblp'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-----------------------+--------
+ pdf | success | 305142
+ pdf | no-pdf-link | 192683
+ pdf | no-capture | 42634
+ pdf | terminal-bad-status | 38041
+ pdf | skip-url-blocklist | 31055
+ pdf | link-loop | 9263
+ pdf | wrong-mimetype | 4545
+ pdf | redirect-loop | 3952
+ pdf | empty-blob | 2705
+ pdf | wayback-content-error | 834
+ pdf | wayback-error | 294
+ pdf | petabox-error | 202
+ pdf | blocked-cookie | 155
+ pdf | cdx-error | 115
+ pdf | body-too-large | 66
+ pdf | bad-redirect | 19
+ pdf | timeout | 7
+ pdf | bad-gzip-encoding | 4
+ (18 rows)
+
+That is quite a lot of `no-pdf-link`; it might be worth doing a random sample
+and/or re-ingesting. There is also a chunk of `no-capture` to retry.
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@
+
+This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for
+heritrix bulk crawling, along with JALC and DBLP URLs.
+
+ export SNAPSHOT=2022-07-20
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
+ # 9.72M 0:36:28 [4.44k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # 9.72M 0:17:04 [9.49k/s]
+ # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
+
+Stats after this load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3165539
+ pdf | | 2078874
+ html | | 1547698
+ html | wrong-scope | 1114332
+ pdf | no-pdf-link | 517261
+ html | success | 388376
+ html | unknown-scope | 242044
+ pdf | no-capture | 179030
+ pdf | terminal-bad-status | 174741
+ html | no-capture | 155323
+ pdf | null-body | 129267
+ pdf | redirect-loop | 127136
+ html | html-resource-no-capture | 117275
+ html | null-body | 100296
+ pdf | blocked-cookie | 71093
+ html | redirect-loop | 65519
+ html | terminal-bad-status | 64856
+ html | blocked-cookie | 64095
+ html | spn2-backoff | 55173
+ pdf | link-loop | 27440
+ html | wrong-mimetype | 26016
+ html | wayback-content-error | 20109
+ xml | | 13624
+ pdf | wrong-mimetype | 8411
+ xml | success | 6899
+ html | petabox-error | 6199
+ html | wayback-error | 5269
+ html | spn2-cdx-lookup-failure | 4635
+ html | spn2-recent-capture | 4527
+ xml | null-body | 2353
+ (30 rows)
+
+## Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
+ # COPY 3962331
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
+ # 3.96M 0:01:47 [36.7k/s]
+
+Top domains:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 789988 www.mdpi.com
+ 318142 www.frontiersin.org
+ 226316 link.springer.com
+ 204429 www.scielo.br
+ 201175 www.sciencedirect.com
+ 72852 ieeexplore.ieee.org
+ 68983 dx.doi.org
+ 33286 www.dovepress.com
+ 26020 elifesciences.org
+ 23838 www.cetjournal.it
+ 21102 mab-online.nl
+ 20242 www.revistas.usp.br
+ 16564 periodicos.uem.br
+ 15710 journals.openedition.org
+ 14514 dergipark.org.tr
+ 14072 apcz.umk.pl
+ 13924 ojs.minions.amsterdam
+ 13717 bmgn-lchr.nl
+ 13512 ojstest.minions.amsterdam
+ 10440 journals.asm.org
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # Done
+
+## Stats Again
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 4704006
+ html | wrong-scope | 1761227
+ html | success | 778165
+ pdf | no-pdf-link | 759805
+ html | no-capture | 382080
+ html | unknown-scope | 313391
+ html | html-resource-no-capture | 292953
+ pdf | no-capture | 290311
+ pdf | terminal-bad-status | 271776
+ pdf | null-body | 129267
+ pdf | blocked-cookie | 108491
+ html | terminal-bad-status | 103014
+ html | null-body | 100296
+ html | blocked-cookie | 88533
+ pdf | | 81517
+ pdf | skip-url-blocklist | 76443
+ html | spn2-backoff | 50615
+ pdf | link-loop | 45516
+ html | wrong-mimetype | 33525
+ html | wayback-content-error | 25535
+ pdf | empty-blob | 21431
+ pdf | redirect-loop | 19795
+ html | petabox-error | 18291
+ html | empty-blob | 14391
+ pdf | wrong-mimetype | 14084
+ html | redirect-loop | 12856
+ xml | success | 10381
+ xml | no-capture | 10008
+ html | skip-url-blocklist | 3294
+ html | cdx-error | 3275
+ (30 rows)
+
+Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k
+PDFs with no attempt at all? Maybe a filter, or bogus URLs.
+
+Over 1.5M new PDF success over this crawl iteration period, nice.
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md
new file mode 100644
index 0000000..415f23b
--- /dev/null
+++ b/notes/ingest/2022-07_targeted.md
@@ -0,0 +1,140 @@
+
+Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.
+
+ export PATCHDATE=2022-07-29
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
+ => COPY 3524573
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ => 3.11M 0:01:08 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 624948 doi.org
+ 382492 www.jstage.jst.go.jp
+ 275087 www.mdpi.com
+ 157134 www.persee.fr
+ 108979 www.sciencedirect.com
+ 94375 www.scielo.br
+ 50834 onlinelibrary.wiley.com
+ 49991 journals.lww.com
+ 30354 www.frontiersin.org
+ 27963 doaj.org
+ 27058 www.e-periodica.ch
+ 24147 dl.acm.org
+ 23389 aclanthology.org
+ 22086 www.research-collection.ethz.ch
+ 21589 medien.die-bonn.de
+ 18866 www.ingentaconnect.com
+ 18583 doi.nrct.go.th
+ 18271 repositories.lib.utexas.edu
+ 17634 hdl.handle.net
+ 16366 archives.datapages.com
+ 15146 cgscholar.com
+ 13987 dl.gi.de
+ 13188 www.degruyter.com
+ 12503 ethos.bl.uk
+ 12304 preprints.jmir.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+ => done
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+
+## Re-Ingest
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
+ => 3.52M 0:01:37 [36.2k/s]
+
+Ingest:
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md
new file mode 100644
index 0000000..ac7c68f
--- /dev/null
+++ b/notes/ingest/2022-09_oaipmh.md
@@ -0,0 +1,397 @@
+
+Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921>
+
+I updated the transform script to block some additional domains.
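+
+Presumably that filter is just a hostname check against a blocklist, roughly
+this shape (hypothetical entries and helper name, not the script's actual
+contents):
+
+    from urllib.parse import urlparse
+
+    DOMAIN_BLOCKLIST = [
+        # hypothetical examples; the real list lives in scripts/oai2ingestrequest.py
+        "example-image-repository.org",
+        "example-library-catalog.edu",
+    ]
+
+    def url_is_blocked(url: str) -> bool:
+        domain = urlparse(url).netloc.lower().split(":")[0]
+        return any(domain == d or domain.endswith("." + d) for d in DOMAIN_BLOCKLIST)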
+
+
+## Prep
+
+Fetch the snapshot:
+
+ cd /srv/sandcrawler/tasks/
+ wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst
+
+Transform to ingest requests:
+
+ cd /srv/sandcrawler/src/python
+ git log | head -n1
+ # commit dfd4605d84712eccb95a63e50b0bcb343642b433
+
+ pipenv shell
+ zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz
+ # 16.1M 1:01:02 [4.38k/s]
+
+Curious about types, though this would probably be handled at fatcat ingest
+time:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt
+
+ head oai_type_counts.txt -n30
+ 5623867 info:eu-repo/semantics/article
+ 5334928 info:eu-repo/semantics/publishedVersion
+ 3870359 text
+ 1240225 Text
+ 829169 Article
+ 769849 NonPeerReviewed
+ 665700 PeerReviewed
+ 648740 Peer-reviewed Article
+ 547857 article
+ 482906 info:eu-repo/semantics/bachelorThesis
+ 353814 Thesis
+ 329269 Student thesis
+ 262650 info:eu-repo/semantics/conferenceObject
+ 185354 Journal articles
+ 162021 info:eu-repo/semantics/doctoralThesis
+ 152079 Journal Article
+ 150226 Research Article
+ 130217 Conference papers
+ 127255 Artículo revisado por pares
+ 124243 Newspaper
+ 123908 ##rt.metadata.pkp.peerReviewed##
+ 123309 Photograph
+ 122981 info:eu-repo/semantics/masterThesis
+ 116719 Book
+ 108946 Image
+ 108216 Report
+ 107946 Other
+ 103562 masterThesis
+ 103038 info:eu-repo/semantics/other
+ 101404 StillImage
+ [...]
+
+And formats:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt
+
+ head -n 20 oai_format_counts.txt
+ 11151928 application/pdf
+ 677413 text
+ 561656 text/html
+ 498518 image/jpeg
+ 231219 Text
+ 193638 text/xml
+ 147214 Image
+ 117073 image/jpg
+ 110872 pdf
+ 91323 image/tiff
+ 76948 bib
+ 75393 application/xml
+ 70244 Digitized from 35 mm. microfilm.
+ 68206 mods
+ 59227 PDF
+ 57677 application/epub+zip
+ 57602 application/octet-stream
+ 52072 text/plain
+ 51620 application/msword
+ 47227 audio/mpeg
+
+Also, just overall size (number of records):
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l
+ # 20,840,301
+
+Next load in to sandcrawler DB:
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request -
+
+ Traceback (most recent call last):
+ File "./persist_tool.py", line 311, in <module>
+ main()
+ File "./persist_tool.py", line 307, in main
+ args.func(args)
+ File "./persist_tool.py", line 119, in run_ingest_request
+ pusher.run()
+ File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run
+ self.worker.push_batch(batch)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values
+ cur.execute(b''.join(parts))
+ psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx"
+ DETAIL: Index row references tuple (6893121,3) in relation "ingest_request".
+ HINT: Values larger than 1/3 of a buffer page cannot be indexed.
+ Consider a function index of an MD5 hash of the value, or use full text indexing.
+ 15.7M 0:41:48 [6.27k/s]
+
+Darn, this means we won't get reasonable stats about how many rows were
+inserted/updated.
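+
+(The HINT's alternative would be a function index over an MD5 hash of the URL,
+something like the statement below; noted here only to illustrate the
+suggestion, it was not applied:)
+
+    CREATE INDEX ingest_request_base_url_md5_idx
+        ON ingest_request (md5(base_url));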
+
+Patched the persist tool to skip very long URLs, and ran again (backwards, just
+URLs which didn't get inserted already):
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \
+ | tac \
+ | head -n1000000 \
+ | pv -l \
+ | ./persist_tool.py ingest-request -
+ # 1.00M 0:03:04 [5.41k/s]
+ # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0})
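+
+The patch itself is essentially a length guard before building insert rows; a
+rough sketch of the idea (function and variable names illustrative, not the
+exact sandcrawler code):
+
+    from collections import Counter
+
+    # btree index rows max out around 2704 bytes, so URLs anywhere near that
+    # length can't be indexed; skip and count them instead of failing the batch
+    MAX_URL_LEN = 2048
+
+    def filter_requests(requests, counts: Counter):
+        for req in requests:
+            if len(req["base_url"]) > MAX_URL_LEN:
+                counts["skip-url-too-long"] += 1
+                continue
+            yield req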
+
+Status of just the new lines:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+---------
+ | 6398455
+ success | 540219
+ no-pdf-link | 41316
+ link-loop | 23871
+ no-capture | 11350
+ redirect-loop | 8315
+ wrong-mimetype | 2394
+ terminal-bad-status | 1540
+ null-body | 1038
+ cdx-error | 272
+ empty-blob | 237
+ petabox-error | 213
+ wayback-error | 186
+ blocked-cookie | 107
+ timeout | 47
+ wayback-content-error | 26
+ spn2-cdx-lookup-failure | 21
+ skip-url-blocklist | 16
+ spn2-backoff | 15
+ body-too-large | 13
+ (20 rows)
+
+
+## Bulk Ingest
+
+Domains/prefixes should already have been filtered in the transform script, so
+no additional filters are included here.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json';
+ # COPY 6398455
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json
+ # 6.40M 0:02:18 [46.2k/s]
+
+ cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # DONE
+
+Expect this ingest to take a week or so.
+
+Then, run stats again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3617175
+ success | 2775036
+ no-pdf-link | 449298
+ link-loop | 74260
+ terminal-bad-status | 47819
+ wrong-mimetype | 20195
+ redirect-loop | 18197
+ empty-blob | 12127
+ cdx-error | 3038
+ skip-url-blocklist | 2630
+ wayback-error | 2599
+ petabox-error | 2354
+ wayback-content-error | 1617
+ blocked-cookie | 1293
+ null-body | 1038
+ body-too-large | 670
+ | 143
+ bad-gzip-encoding | 64
+ timeout | 47
+ spn2-cdx-lookup-failure | 20
+ (20 rows)
+
+
+## Crawl Seedlist
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'terminal-bad-status'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'timeout'
+ OR ingest_file_result.status = 'wayback-content-error'
+ )
+ ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json';
+ => COPY 3692846
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json
+ => 3.69M 0:01:19 [46.6k/s]
+
+This will be used for re-ingest later. For now, extract URLs:
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | jq .base_url -r \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ => 3.66M 0:00:59 [61.8k/s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | rg '"terminal_url"' \
+ | jq -r .result.terminal_url \
+ | rg -v ^null$ \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ => 0.00 0:00:05 [0.00 /s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | awk '{print "F+ " $1}' \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+What domains are we crawling?
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | sort -u -S 4G \
+ | cut -d/ -f3 \
+ | sort \
+ | uniq -c \
+ | sort -nr \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+
+ head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+ 91899 raco.cat
+ 70116 islandora.wrlc.org
+ 68708 urn.kb.se
+ 63726 citeseerx.ist.psu.edu
+ 50370 publications.rwth-aachen.de
+ 44885 urn.nsk.hr
+ 38429 server15795.contentdm.oclc.org
+ 33041 periodicos.ufpb.br
+ 32519 nbn-resolving.org
+ 31990 www.ajol.info
+ 24745 hal.archives-ouvertes.fr
+ 22569 id.nii.ac.jp
+ 17239 tilburguniversity.on.worldcat.org
+ 15873 dspace.nbuv.gov.ua
+ 15436 digitalcommons.wustl.edu
+ 14885 www.iiste.org
+ 14623 www.manchester.ac.uk
+ 14033 nbn-resolving.de
+ 13999 opus4.kobv.de
+ 13689 www.redalyc.org
+
+Sizes:
+
+ wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+
+Copy seedlist to crawler:
+
+ # as regular user
+ scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp
+
+## Post-Crawl Bulk Ingest
+
+ # ran 2022-11-16, after crawl cleanup
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -----------------------+---------
+ success | 4721164 +1,946,128
+ no-pdf-link | 1116290
+ no-capture | 673939
+ terminal-bad-status | 232217
+ link-loop | 148544
+ wrong-mimetype | 68841
+ redirect-loop | 26262
+ empty-blob | 17759
+ cdx-error | 6570
+ blocked-cookie | 4026
+ blocked-wall | 3054
+ skip-url-blocklist | 2924
+ body-too-large | 2404
+ bad-redirect | 1565
+ wayback-error | 1320
+ petabox-error | 1083
+ null-body | 1038
+ wayback-content-error | 264
+ bad-gzip-encoding | 150
+ | 143
+ (20 rows)
+
diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt
new file mode 100644
index 0000000..ae06272
--- /dev/null
+++ b/notes/ingest_domains.txt
@@ -0,0 +1,294 @@
+
+## Queries to find broken domains
+
+Top domains with failed ingests:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+Status overview for a particular domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT domain, terminal_status_code, COUNT((domain, terminal_status_code))
+ FROM (SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ AND t1.terminal_status_code is not null
+ GROUP BY domain, terminal_status_code
+ ORDER BY COUNT DESC;
+
+Sample recent failures:
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+## Failing
+
+www.osapublishing.org
+
+    this publisher (The Optical Society) is systematically using a CAPTCHA to
+    gate access to PDFs. bummer! could ask them to white-list?
+
+ has citation_pdf_url, so that isn't an issue
+
+ status: "no-pdf-link"
+ hops:
+ "https://doi.org/10.1364/optica.6.000798",
+ "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0"
+ "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C"
+
+ domain | status | count
+ -----------------------+---------------------+-------
+ www.osapublishing.org | no-capture | 16680
+ www.osapublishing.org | no-pdf-link | 373
+ www.osapublishing.org | redirect-loop | 19
+ www.osapublishing.org | terminal-bad-status | 5
+ www.osapublishing.org | cdx-error | 1
+ www.osapublishing.org | wrong-mimetype | 1
+ www.osapublishing.org | spn-error | 1
+ www.osapublishing.org | success | 1
+ www.osapublishing.org | wayback-error | 1
+ (9 rows)
+
+www.persee.fr
+
+ Seems to be mostly blocking or rate-limiting?
+
+ domain | status | count
+ ---------------+-------------------------------------+-------
+ www.persee.fr | no-capture | 37862
+ www.persee.fr | terminal-bad-status | 3134
+ www.persee.fr | gateway-timeout | 2828
+ www.persee.fr | no-pdf-link | 431
+ www.persee.fr | spn-error | 75
+ www.persee.fr | redirect-loop | 23
+ www.persee.fr | success | 8
+ www.persee.fr | spn2-error | 2
+ www.persee.fr | spn2-error:soft-time-limit-exceeded | 1
+ www.persee.fr | wrong-mimetype | 1
+ (10 rows)
+
+journals.openedition.org
+
+ PDF access is via "freemium" subscription. Get redirects to:
+
+ https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053
+
+ Content is technically open access (HTML and license; for all content?),
+ but can't be crawled as PDF without subscription.
+
+ domain | status | count
+ --------------------------+-------------------------+-------
+ journals.openedition.org | redirect-loop | 29587
+ journals.openedition.org | success | 6821
+ journals.openedition.org | no-pdf-link | 1507
+ journals.openedition.org | no-capture | 412
+ journals.openedition.org | wayback-error | 32
+ journals.openedition.org | wrong-mimetype | 27
+ journals.openedition.org | terminal-bad-status | 13
+ journals.openedition.org | spn2-cdx-lookup-failure | 4
+ journals.openedition.org | spn-remote-error | 1
+ journals.openedition.org | null-body | 1
+ journals.openedition.org | cdx-error | 1
+ (11 rows)
+
+journals.lww.com
+
+ no-pdf-link
+
+ domain | status | count
+ ------------------+----------------+-------
+ journals.lww.com | no-pdf-link | 11668
+ journals.lww.com | wrong-mimetype | 131
+ (2 rows)
+
+ doi prefix: 10.1097
+
+ <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" />
+ data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
+
+    Something weird is going on; maybe they are blocking-via-redirect based on
+    our User-Agent? Seems like wget works, so it's funny that they don't block that.
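+
+    A quick way to poke at that theory from the command line (URL taken from
+    the wkhealth_pdf_url example above; note curl sends HEAD here, which may
+    behave differently than a full GET):
+
+        curl -sI "https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" | head -n1
+        curl -sI -A "Wget/1.21" "https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" | head -n1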
+
+musewide.aip.de
+
+ no-pdf-link
+
+koreascience.or.kr | no-pdf-link | 8867
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+www.cairn.info | link-loop | 8717
+
+easy.dans.knaw.nl | no-pdf-link | 8262
+scielo.conicyt.cl | no-pdf-link | 7925
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'scielo.conicyt.cl'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%scielo.conicyt.cl%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+ domain | status | count
+ -------------------+---------------------+-------
+ scielo.conicyt.cl | no-pdf-link | 7926
+ scielo.conicyt.cl | success | 4972
+ scielo.conicyt.cl | terminal-bad-status | 1474
+ scielo.conicyt.cl | wrong-mimetype | 6
+ scielo.conicyt.cl | no-capture | 4
+ scielo.conicyt.cl | null-body | 1
+
+
+ pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 |
+ pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 |
+ pdf | https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 |
+ pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 |
+
+ These seem, on retry, like success? Maybe previous was a matter of warc/revisit not getting handled correctly?
+
+ pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 |
+ pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 |
+ pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 |
+
+    These look like web/XML only.
+
+    TODO: XML ingest (and replay?) support. These are served as "<article>" XML; not sure if that is JATS or something else.
+
+www.kci.go.kr | no-pdf-link | 6842
+www.m-hikari.com | no-pdf-link | 6763
+cshprotocols.cshlp.org | no-pdf-link | 6553
+www.bibliotekevirtual.org | no-pdf-link | 6309
+data.hpc.imperial.ac.uk | no-pdf-link | 6071
+projecteuclid.org | link-loop | 5970
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'projecteuclid.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%projecteuclid.org%'
+ AND status = 'link-loop'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------+-------------------------+-------
+ projecteuclid.org | link-loop | 5985
+ projecteuclid.org | success | 26
+ projecteuclid.org | wayback-error | 26
+ projecteuclid.org | wrong-mimetype | 17
+ projecteuclid.org | spn2-cdx-lookup-failure | 4
+ projecteuclid.org | other-mimetype | 4
+ projecteuclid.org | no-capture | 3
+ projecteuclid.org | terminal-bad-status | 2
+ projecteuclid.org | spn2-error:job-failed | 1
+ projecteuclid.org | spn-remote-error | 1
+ (10 rows)
+
+ Doing a cookie check and redirect.
+
+ TODO: brozzler behavior to "click the link" instead?
+
+www.scielo.br | no-pdf-link | 5823
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.scielo.br'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.scielo.br%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ---------------+-------------------------+-------
+ www.scielo.br | success | 35150
+ www.scielo.br | no-pdf-link | 5839
+ www.scielo.br | terminal-bad-status | 429
+ www.scielo.br | no-capture | 189
+ www.scielo.br | wrong-mimetype | 7
+ www.scielo.br | spn2-cdx-lookup-failure | 2
+ (6 rows)
+
+ Seems to just be the subset with no PDFs.
+
+get.iedadata.org | no-pdf-link | 5822
+www.pdcnet.org | no-pdf-link | 5798
+publications.rwth-aachen.de | no-pdf-link | 5323
+www.sciencedomain.org | no-pdf-link | 5231
+medicalforum.ch | terminal-bad-status | 4574
+jrnl.nau.edu.ua | link-loop | 4145
+ojs.academypublisher.com | no-pdf-link | 4017
+
+## MAG bulk ingest
+
+- dialnet.unirioja.es | redirect-loop | 240967
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ => may be worth re-crawling via heritrix?
+- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ => and other *.onlinelibrary.wiley.com
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+- papers.ssrn.com | redirect-loop | 27328
+ => blocking is pretty aggressive, using cookies or referrer or something.
+ maybe a brozzler behavior would work, but doesn't currently
+
+## Out of Scope
+
+Datasets only?
+
+- plutof.ut.ee
+- www.gbif.org
+- doi.pangaea.de
+- www.plate-archive.org
+
+Historical non-paper content:
+
+- dhz.uni-passau.de (newspapers)
+- digital.ucd.ie (irish historical)
+
+Mostly datasets (some PDF content):
+
+- *.figshare.com
+- zenodo.com
+- data.mendeley.com
diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt
new file mode 100644
index 0000000..fcdc3e4
--- /dev/null
+++ b/notes/possible_ingest_targets.txt
@@ -0,0 +1,15 @@
+
+- all releases from small journals (eg, fewer than 200 papers published), regardless of OA status, and not big5
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md
new file mode 100644
index 0000000..c0bb65e
--- /dev/null
+++ b/notes/tasks/2020-10-21_pdfextract_holes.md
@@ -0,0 +1,74 @@
+
+Realized I had not enabled persisting of PDF extraction results (thumbnail,
+text) in ingest worker when added over the summer. So now need to run a
+catch-up. This applied to both "live" and "bulk" ingest.
+
+## `cdx` / `ingest` / `grobid` catch-up
+
+First, re-run extraction for cases where we did an ingest, and grobid ran
+successfully, and we have a CDX row, but no `pdf_meta`:
+
+ -- this is a slow query
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
+ WITH NULL '';
+ => 19,676,116
+
+Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably.
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And again, after a couple partitions got hung up:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json'
+ WITH NULL '';
+
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ => 562k 0:00:16 [34.6k/s]
+
+## `petabox` / `grobid` catch-up
+
+These didn't all seem to extract correctly before; after 1.5m rows, there were
+still 900k unprocessed. Trying again.
+
+ COPY (
+ SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+ FROM grobid
+ LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE petabox.sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json'
+ WITH NULL '';
+
+    cat /grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## `cdx` / `grobid` catch-up
+
+Next will be to process PDFs with GROBID and CDX but no ingest.
+
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
new file mode 100644
index 0000000..cd8176e
--- /dev/null
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -0,0 +1,70 @@
+
+Want to dump a URL list to share with partners, filtered to content we think is
+likely to be scholarly.
+
+Columns to include:
+
+- original URL
+- capture timestamp
+- SHA1
+
+## Stats Overview
+
+file_meta table, mimetype=application/pdf: 173,816,433
+
+cdx table, mimetype=application/pdf: 131,346,703
+
+ingest_file_result table, pdf, success: 66,487,928
+
+## Ingested PDF URLs
+
+"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also?
+
+ COPY (
+ SELECT
+ base_url as start_url,
+ terminal_url as pdf_url,
+ terminal_dt as pdf_url_timestamp,
+ terminal_sha1hex as pdf_sha1hex
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv'
+ WITH NULL '';
+ => 77,892,849
+
+## CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.url as pdf_url,
+ cdx.datetime as pdf_url_timestamp,
+ cdx.sha1hex as pdf_sha1hex
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv'
+ WITH NULL '';
+ => 147,837,935
+
+## Processed web PDFs
+
+"Parsed web PDFs": `file_meta`, left join CDX
+
+(didn't do this one)
+
+---
+
+Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09>
diff --git a/notes/tasks/2021-10-29_crossref_refs_backfill.md b/notes/tasks/2021-10-29_crossref_refs_backfill.md
new file mode 100644
index 0000000..94eefec
--- /dev/null
+++ b/notes/tasks/2021-10-29_crossref_refs_backfill.md
@@ -0,0 +1,235 @@
+
+The current sandcrawler-db crossref table was backfilled from a 2021-01
+snapshot, and has not been updated since.
+
+Would like to use the existing fatcat Kafka feed to keep the crossref table up
+to date, and also backfill in GROBID reference parsing of all `unstructured`
+references.
+
+Current plan is:
+
+1. use kafkacat CLI to dump crossref Kafka topic, from the beginning of 2021 up
+ to some recent date
+2. use `persist_tool.py`, with a large batch size (200?) to backfill this dump
+ into sandcrawler-db. this will update some rows multiple times (if there
+ have been updates)
+3. dump the full crossref table, as a point-in-time snapshot
+4. filter to crossref records that have `unstructured` references in them (at
+ all)
+5. use `grobid_tool.py` with `parallel` to batch process references
+6. backfill these refs using a simple SQL COPY statement
+7. deploy crossref persist worker, with ref updates on, and roll the consumer
+ group back to date of dump
+8. wait for everything to catch up
+
+
+## Commands
+
+Get a timestamp in milliseconds:
+
+ 2021-01-01 is:
+ 1609488000 in unix time (seconds)
+        1609488000000 in milliseconds
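+
+One way to compute that value (GNU `date`, run on a host in the Pacific
+timezone, since 1609488000 is 2021-01-01T08:00:00Z, i.e. local midnight):
+
+    echo "$(date -d 2021-01-01 +%s)000"
+    # 1609488000000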
+
+Hrm, oldest messages seem to actually be from 2021-04-28T19:21:10Z though. Due
+to topic compaction? Yup, we have a 180 day compaction policy on that topic,
+probably from when kafka space was tight. Oh well!
+
+Updated retention for this topic to `46656000000` (~540 days, ~18 months) using
+`kafka-manager` web app.
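+
+(For reference, the equivalent change with the stock Kafka CLI tooling would
+look roughly like this, against the same broker used for the consumer-group
+commands further below:)
+
+    ./kafka-configs.sh --bootstrap-server localhost:9092 --alter \
+        --entity-type topics --entity-name fatcat-prod.api-crossref \
+        --add-config retention.ms=46656000000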
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t fatcat-prod.api-crossref -o s@1609488000000 \
+ | pv -l \
+ | gzip \
+ > crossref_feed_start20210428_end20211029.json.gz
+
+This resulted in ~36 million rows, 46GB.
+
+`scp` that around, then run persist on `sandcrawler-db`:
+
+ # in pipenv, as sandcrawler user
+ # manually edited to set batch size to 200
+ zcat /srv/sandcrawler/tasks/crossref_feed_start20210428_end20211029.json.gz \
+ | pv -l \
+ | ./persist_tool.py crossref -
+ => 36.8M 11:02:43 [ 925 /s]
+
+With a single thread, the persist process runs at about 1,000 rows/sec, which
+works out to about 10 hours for 36 million rows.
+
+At the start of this process, total PostgreSQL database size is 832.21G. At the
+end, 902.51G. Have not run a `VACUUM FULL` or anything like that.
+
+Query to dump crossref rows which have any refs and compress output with pigz:
+
+ # dump_crossref.sql
+ COPY (
+ SELECT record
+ FROM crossref
+ WHERE record::jsonb @? '$.reference[*].unstructured'
+ -- LIMIT 5
+ )
+ TO STDOUT
+ WITH NULL '';
+
+    # 'sed' required because of double quote escaping in postgresql output:
+ # https://stackoverflow.com/questions/29869983/postgres-row-to-json-produces-invalid-json-with-double-escaped-quotes/29871069
+ # 'rg' filter is just being conservative
+
+ # XXX: next time add to the pipeline: rg -v "\\\\"
+ # or, find some way to filter/transform this kind of SQL export better?
+ psql sandcrawler < dump_crossref.sql \
+ | sed 's/\\"/\"/g' \
+ | rg '^\{' \
+ | pv -l \
+ | pigz \
+ > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz
+ => 26.1M 3:22:51 [2.15k/s]
+
+ # NOTE: -j40 is for production run with ~dedicated GROBID server with many cores
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz \
+ | rg -v "\\\\" \
+ | parallel -j35 --linebuffer --round-robin --pipe ./grobid_tool.py --grobid-host http://wbgrp-svc096.us.archive.org:8070 parse-crossref-refs - \
+ | pv -l \
+ | pigz \
+ > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz
+
+ # from earlier testing with -j40: able to do about 300-500 records/second
+ # 23.9k 0:01:14 [ 320 /s]
+ # 134518 total refs parsed
+ # ~1817 refs/second parsed
+
+ # with errors, got through about: 2.08M 1:38:20 [ 352 /s]
+    # was still seeing bad JSON?
+ # JSON lines pushed: Counter({'total': 105898, 'pushed': 105886, 'error-json-decode': 12})
+
+ # finally, without errors:
+ # 18.6M 8:35:02 [ 603 /s]
+
+In the next step, going to need a small direct persist worker to copy lines
+verbatim into just the `grobid_refs` table.
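+
+(Whatever form that worker takes, the insert/update counters further below
+suggest an upsert; a hypothetical sketch, with made-up column names that do
+not necessarily match the real `grobid_refs` schema:)
+
+    INSERT INTO grobid_refs (source, source_id, refs_json, updated)
+    VALUES (%s, %s, %s, now())
+    ON CONFLICT (source, source_id) DO UPDATE
+        SET refs_json = EXCLUDED.refs_json,
+            updated = EXCLUDED.updated;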
+
+## Errors
+
+Got errors when running for real:
+
+ xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 114, column 33
+
+ requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: http://wbgrp-svc096.us.archive.org:8070/api/processCitationList
+
+ urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f54b0a3bd00>: Failed to establish a new connection: [Errno 99] Cannot assign requested address'))
+
+
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ERROR [2021-11-03 06:57:32,569] org.grobid.service.process.GrobidRestProcessString: An unexpected exception occurs.
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! java.lang.NullPointerException: null
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.data.BiblioItem.cleanTitles(BiblioItem.java:1784)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingLayoutTokenMultiple(CitationParser.java:175)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingStringMultiple(CitationParser.java:92)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.Engine.processRawReferences(Engine.java:168)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.process.GrobidRestProcessString.processCitationList(GrobidRestProcessString.java:316)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.GrobidRestService.processCitationListReturnXml_post(GrobidRestService.java:581)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.GeneratedMethodAccessor19.invoke(Unknown Source)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at java.lang.reflect.Method.invoke(Method.java:498)
+ [...]
+
+Bogus example reference causing 500 error (among other non-error citations) (doi:10.5817/cz.muni.m210-9541-2019):
+
+ 'Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.'
+ '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0'
+ 'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho; diplomová práce. Brno: Masarykova univerzita.',
+
+`s.strip()` in Python would remove these non-breaking spaces (update: implemented this later)
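+
+A minimal sketch of that cleanup (variable names illustrative; the sample
+strings are the ones quoted above):
+
+    raw_refs = [
+        'Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.',
+        '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0',
+    ]
+    # str.strip() treats U+00A0 (non-breaking space) as whitespace, so this both
+    # trims references and drops entries that are nothing but padding
+    refs = [r.strip() for r in raw_refs if r.strip()]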
+
+ Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique. CiiT International Journal of Wireless Communication 3(3), 183–187 (2011) (Print: ISSN 0974 – 9756 & Online: ISSN 0974 – 9640)
+
+Also:
+
+ truncating very large reference list for doi:10.1017/chol9780521264303.033 len:2281
+ truncating very large reference list for doi:10.1017/chol9780521263351.011 len:3129
+ truncating very large reference list for doi:10.1017/chol9780521263351.022 len:2968
+ truncating very large reference list for doi:10.1017/chol9780521264303.036 len:2221
+ truncating very large reference list for doi:10.1017/chol9780521264303.007 len:2238
+ truncating very large reference list for doi:10.1017/chol9780521086912.001 len:2177
+ truncating very large reference list for doi:10.1017/chol9780521228046.002 len:2133
+ truncating very large reference list for doi:10.1017/chol9780521264303.035 len:2221
+ truncating very large reference list for doi:10.1017/chol9780521264303.002 len:2279
+
+Seems like bumping to 2500 as the maximum reference list size might be
+reasonable (it is 2000 currently).
+
+After some refactoring, still getting:
+
+ requests.exceptions.ConnectionError
+
+This is because I am doing POST without a session.
+
+Then, still got requests.exceptions.ReadTimeout
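+
+One shape the client-side fix can take: reuse a session and set explicit
+timeouts. A sketch (assuming GROBID's standard `processCitationList` endpoint
+and `citations` form field; not the actual sandcrawler client code):
+
+    import requests
+
+    citation_strings = [
+        "Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.",
+    ]
+
+    # one Session per worker: connections are pooled and reused, instead of a
+    # fresh socket per request (which is what exhausts ephemeral ports and
+    # causes the "Cannot assign requested address" error above)
+    session = requests.Session()
+
+    resp = session.post(
+        "http://wbgrp-svc096.us.archive.org:8070/api/processCitationList",
+        data={"citations": citation_strings},
+        timeout=(10.0, 120.0),  # (connect, read) seconds; ReadTimeout still needs catch/retry
+    )
+    resp.raise_for_status()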
+
+Finally, got through the whole batch, (`18.6M 8:35:02 [ 603 /s]` output), with
+only a few dozen rows like:
+
+ GROBID returned bad XML for Crossref DOI: 10.1007/978-3-030-03008-7_21-1
+ GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1496-8_3
+ GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1493-7_3
+ GROBID returned bad XML for Crossref DOI: 10.1007/978-3-319-96184-2_2
+ GROBID returned bad XML for Crossref DOI: 10.1063/1.5031970
+ truncating very large reference list for doi:10.1007/978-1-4757-1499-9_15 len:11401
+ GROBID returned bad XML for Crossref DOI: 10.1016/j.oraloncology.2019.104562
+ GROBID returned bad XML for Crossref DOI: 10.1016/j.pec.2020.04.010
+
+So things seem to be working!
+
+Summary lines looked like:
+
+ JSON lines pushed: Counter({'total': 531487, 'pushed': 531487})
+ Worker: Counter({'total': 536541, 'failed': 3})
+
+Failures per batch were on the order of 0 to 3.
+
+## Postgres Backfill
+
+Start with a sample:
+
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \
+ | head -n1000 \
+ | ./persist_tool.py grobid-refs -
+ # Worker: Counter({'total': 1000, 'insert-grobid_refs': 1000, 'update-grobid_refs': 0})
+
+ # same command again:
+ # Worker: Counter({'total': 1000, 'update-grobid_refs': 1000, 'insert-grobid_refs': 0})
+
+Example DOIs:
+
+ # no refs
+ 10.1007/978-1-349-04135-0_3
+ http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-04135-0_3"
+
+ # with refs
+ 10.1007/978-1-349-03594-6_2
+ http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-03594-6_2"
+
+Seems to be working, so will do the full backfill. Can check table sizes on a
+per-table basis when complete.
+
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \
+ | pv -l \
+ | ./persist_tool.py grobid-refs -
+ # Worker: Counter({'total': 18646668, 'insert-grobid_refs': 18639195, 'update-grobid_refs': 7473})
+
+
+## Kafka Setup
+
+Added ansible config and deployed persist-crossref worker.
+
+First roll-back just a couple days as a test:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-11-07T00:00:00.000
+
+ # eg: Import counts: Counter({'total': 372350, 'insert-grobid_refs': 326987, 'update-crossref': 265581, 'insert-crossref': 106769, 'update-grobid_refs': 45362, 'skip': 1})
+
+Then roll-back to before the snapshot and backfill, to catch up:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-10-26T00:00:00.000
+
+Ran this last command on 2021-11-10, and total lag was around 2,566,741.
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
new file mode 100644
index 0000000..5fb69d1
--- /dev/null
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -0,0 +1,380 @@
+
+Want to test recent updates of GROBID (to fix regex issue), and also re-process
+a number of PDFs which failed to process with GROBID initially.
+
+
+## HTTP 503
+
+These are attempts which failed because GROBID was too busy or not running.
+
+ # IMPROVED BELOW
+ COPY (
+ SELECT row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4749
+
+Not actually that many, which seems good. Confirm that these are unique by sha1hex:
+
+ cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
+ # 302
+
+Nope! Need to add "distinct on":
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4297
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## Never Processed CDX
+
+PDFs in fatcat which have never been processed with GROBID.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json'
+ WITH NULL '';
+ # COPY 15488
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+
+PDFs in fatcat which have never been processed with pdfextract.
+
+ # TODO
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ pdf_meta.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.mimetype = 'application/pdf'
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json'
+ WITH NULL '';
+ # COPY 45535
+
+ cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ # 45.5k 0:00:01 [30.2k/s]
+
+## Timeout or Failure
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json'
+ WITH NULL '';
+ # COPY 8,084,296
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+This doesn't seem to be working very well: mostly errors, empty docs, etc. Will
+roll the kafka consumer group forward after attempting a couple hundred
+thousand of these.
+
+Let's try limiting to files actually in fatcat:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+            -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json'
+ WITH NULL '';
+ # COPY 529265
+
+That is a much more manageable batch to retry.
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 529k 0:00:17 [31.0k/s]
+
+
+## Missing Fatcat Files
+
+There were around a half million fatcat file entities which didn't have `cdx`
+rows in sandcrawler. Did some specific pdfextract processing; now we should do
+GROBID ingest as well.
+
+Enqueue the `CDX` objects for GROBID and pdfextract processing:
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 354k 0:00:11 [30.6k/s]
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And some earlier files of interest on `aitio`:
+
+ cat files_missing_sha256.ingest_results.json \
+ | rg '"application/pdf"' \
+ | rg -v "\\\\" \
+ | jq .cdx -c \
+ | sort -u -S 4G \
+ | pv -l \
+ > files_missing_sha256.cdx.uniq.json
+ # 100k 0:00:47 [2.09k/s]
+
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+
+## Ancient Fatcat Files
+
+Files from an era where we didn't record GROBID version or status, even for
+success.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status_code = 200
+ AND grobid.status IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+            -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json'
+ WITH NULL '';
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 107k 0:00:03 [29.9k/s]
+
+
+## Start Re-Processing Old GROBID Versions
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status = 'success'
+ AND grobid.grobid_version NOT LIKE '0.7.%'
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+            -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json'
+ WITH NULL '';
+
+This one is huge, so we want to process it in batches/chunks of ~5 million rows at a time.
+
+ cd /srv/sandcrawler/tasks/
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \
+ | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json
+
+Submit individual batches like:
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Overall progress:
+
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small)
+
+This finally finished on 2022-04-26. Hooray!
+
+## General Counts
+
+How many fatcat files of what mimetype (reported in sandcrawler-db)?
+
+ SELECT file_meta.mimetype, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY file_meta.mimetype
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+----------
+ application/pdf | 45227033
+ | 433068
+ application/octet-stream | 30634
+ application/jats+xml | 6874
+ text/html | 876
+ application/postscript | 199
+ application/gzip | 173
+ text/plain | 84
+ application/xml | 48
+ application/vnd.ms-powerpoint | 38
+ application/msword | 16
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8
+ image/jpeg | 6
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 4
+ message/rfc822 | 4
+ application/zip | 4
+ text/x-tex | 3
+ application/x-dosexec | 3
+ application/x-tar | 2
+ application/vnd.ms-tnef | 2
+ image/svg+xml | 1
+ image/tiff | 1
+ image/png | 1
+ image/gif | 1
+ application/vnd.ms-office | 1
+ (25 rows)
+
+
+PDF extract status?
+
+ SELECT pdf_meta.status, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY pdf_meta.status
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | count
+ ----------------+----------
+ success | 43415920
+ | 2018522
+ text-too-large | 122730
+ parse-error | 94876
+ not-pdf | 32156
+ error-wayback | 14504
+ bad-unicode | 279
+ bad-pdf | 98
+ empty-blob | 2
+ (9 rows)
+
+
+What are the GROBID status codes for fatcat files? Narrowed down:
+
+ SELECT grobid.status, grobid.status_code, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY grobid.status, grobid.status_code
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 44409069
+ error | 500 | 580402
+ | | 468836
+ | 200 | 240660
+ error-timeout | -4 | 79
+ bad-grobid-xml | 200 | 38
+ error | 200 | 3
+ (7 rows)
+
+Ran the same query again on 2021-12-15:
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 45092915
+ error | 500 | 302373
+ | | 250335
+ | 200 | 53352
+ bad-grobid-xml | 200 | 39
+ error-timeout | -4 | 37
+ error | 200 | 34
+ error | 503 | 2
+ (8 rows)
diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
new file mode 100644
index 0000000..b5422c2
--- /dev/null
+++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
@@ -0,0 +1,23 @@
+
+Martin crawled more than 10 million new PDFs from various platform domains. We
+should get these processed and included in sandcrawler-db.
+
+## Select CDX Rows
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM cdx
+ LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json'
+ WITH NULL '';
+ => COPY 8801527
+
+ cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ # for pdfextract, would be: sandcrawler-prod.unextracted
diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
new file mode 100644
index 0000000..c727a57
--- /dev/null
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -0,0 +1,225 @@
+
+Want to do priority crawling of Ukrainian web content, plus Russia and Belarus.
+
+
+## What is Missing?
+
+ (country_code:ua OR lang:uk)
+ => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA
+ later in day, already some 22k missing found! wow
+ => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing
+
+## Metadata Prep
+
+- container metadata update (no code changes)
+ x wikidata SPARQL update
+ x chocula run
+ x journal metadata update (fatcat)
+ x update journal stats (fatcat extra)
+- DOAJ article metadata import
+ x prep and upload single JSON file
+
+
+## Journal Homepage URL Crawl
+
+x dump ukraine-related journal homepages from chocula DB
+x create crawl config
+x start crawl
+x repeat for belarus and russia
+
+
+ python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv
+
+sqlite3:
+
+    select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publisher like '%ukrain%';
+    1952
+
+ SELECT COUNT(*) FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+ => 1970
+
+ .mode csv
+ .once homepage_urls_ukraine.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+
+ .mode csv
+ .once homepage_urls_russia.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ru'
+ OR journal.lang = 'ru'
+ OR journal.name like '%russ%'
+ OR journal.publisher like '%russ%';
+
+ .mode csv
+ .once homepage_urls_belarus.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'by'
+ OR journal.lang = 'be'
+ OR journal.name like '%belarus%'
+ OR journal.publisher like '%belarus%';
+
+ cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+ 1971 homepage_urls_ukraine.tsv
+ 3482 homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv
+ 3728 homepage_urls_russia.tsv
+ 2420 homepage_urls.2022-03-08.ru_tld.tsv
+ 6030 homepage_urls_russia_combined.2022-03-08.tsv
+
+
+ cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv
+ 138 homepage_urls_belarus.tsv
+ 85 homepage_urls.2022-03-08.by_tld.tsv
+ 222 homepage_urls_belarus_combined.2022-03-08.tsv
+
+
+## Landing Page Crawl
+
+x create crawl config
+x fatcat ingest query for related URLs
+ => special request code/label?
+x finish .by and .ru article URL dump, start crawling
+x URL list filtered from new OAI-PMH feed
+ => do we need to do full bulk load/dump, or not?
+- URL list from partner (google)
+- do we need to do alternative thing of iterating over containers, ingesting each?
+
+ ./fatcat_ingest.py --env prod \
+ --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk"
+
+ # around Tue 08 Mar 2022 01:07:37 PM PST
+ # Expecting 185659 release objects in search queries
+ # didn't complete successfully? hrm
+
+ # ok, retry "manually" (with kafkacat)
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json
+ # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318})
+ # 103k 0:25:04 [68.7 /s]
+
+ zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz
+ # 103k 0:00:02 [38.1k/s]
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:by OR lang:be" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz
+ # Expecting 2266 release objects in search queries
+ # 1.29k 0:00:34 [37.5 /s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ru OR lang:ru" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz
+ # Expecting 1515246 release objects in search queries
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz
+
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt
+ # 309k 0:00:03 [81.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt
+ # 71.2k 0:00:03 [19.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
+ # 276k 0:00:03 [72.9k/s]
+
+
+### Landing Page Bulk Ingest
+
+Running these 2022-03-24, after targeted crawl completed:
+
+ zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 103k 0:00:02 [36.1k/s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 1.29k 0:00:00 [15.8k/s]
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 546k 0:00:13 [40.6k/s]
+
+It will probably take a week or more for these to complete.
+
+
+## Outreach
+
+- openalex
+- sucho.org
+- ceeol.com
diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md
new file mode 100644
index 0000000..273ff32
--- /dev/null
+++ b/notes/tasks/2022-04-27_pdf_url_lists.md
@@ -0,0 +1,72 @@
+
+Another dump of PDF URLs for partners. This time we want to provide a TSV with
+full wayback download URLs, as well as "access" URLs.
+
+ export TASKDATE=2022-04-27
+
+## "Ingested", AKA, "Targetted" PDF URLs
+
+These are URLs where we did a successful ingest run.
+
+ COPY (
+ SELECT
+ terminal_sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
+ ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ AND hit = true
+ ORDER BY terminal_sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 85712674
+
+May contain duplicates, by sha1hex, by URL, or both.
+
+Note that this could be filtered by timestamp, to make it monthly/annual.
+
+
+## All CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url,
+ ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ ORDER BY cdx.sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 161504070
+
+Should be unique by wayback URL; may contain near-duplicates or duplicates by sha1hex.
+
+## Upload to archive.org
+
+TODO: next time compress these files first (gzip/pigz)
+
+    ia upload ia_scholarly_urls_$TASKDATE \
+        -m collection:ia_biblio_metadata \
+        -m title:"IA Scholarly URLs ($TASKDATE)" \
+        -m date:$TASKDATE \
+        -m creator:"Internet Archive Web Group" \
+        -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
+        /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
+
diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
new file mode 100644
index 0000000..74d3857
--- /dev/null
+++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
@@ -0,0 +1,132 @@
+
+Had a huge number of SPN requests for the andrzejklimczuk.com domain,
+presumably from the author.
+
+Many were duplicates (same file, multiple releases, often things like zenodo
+duplication). Many were also GROBID 500s, due to truncated common crawl
+captures.
+
+Needed to clean up! Basically sorted through a few editgroups manually, then
+rejected all the rest and manually re-submitted with the below queries and
+commands:
+
+ SELECT COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
+ => 589
+
+ SELECT ingest_file_result.status, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY ingest_file_result.status;
+
+ status | count
+ ----------------+-------
+ cdx-error | 1
+ success | 587
+ wrong-mimetype | 1
+ (3 rows)
+
+
+ SELECT grobid.status_code, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY grobid.status_code;
+
+ status_code | count
+ -------------+-------
+ 200 | 385
+ 500 | 202
+ | 2
+ (3 rows)
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 500
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
+ => COPY 202
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 200
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
+ => COPY 385
+
+    sudo -u sandcrawler pipenv run \
+        ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
+        > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json
+
+    sudo -u sandcrawler pipenv run \
+        ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+        | jq '. + {force_recrawl: true}' -c \
+        > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
+        | shuf \
+        | head -n60000 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+        | shuf \
+        | head -n100 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+        | shuf \
+        | head -n10000 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+    sudo -u sandcrawler pipenv run \
+        ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+        > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
+        | shuf \
+        | head -n60000 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
diff --git a/please b/please
index 4800112..74e9766 100755
--- a/please
+++ b/please
@@ -12,7 +12,7 @@ import subprocess
from datetime import datetime
HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler"
-HBASE_HOST = "wbgrp-svc263.us.archive.org"
+HBASE_HOST = "wbgrp-svc350.us.archive.org"
ZOOKEEPER_HOSTS = "mtrcs-zk1.us.archive.org:2181"
GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
@@ -487,6 +487,23 @@ def run_dumpungrobided(args):
env=args.env)
subprocess.call(cmd, shell=True)
+def run_sbackfill(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting scalding backfill job...")
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.CdxBackfillJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --cdx-input-path {input_cdx}""".format(
+ input_cdx=args.input_cdx,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
def main():
parser = argparse.ArgumentParser()
@@ -506,6 +523,11 @@ def main():
sub_backfill.add_argument('input_cdx',
help="full HDFS path of CDX file to backfill")
+ sub_sbackfill = subparsers.add_parser('sbackfill')
+ sub_sbackfill.set_defaults(func=run_sbackfill)
+ sub_sbackfill.add_argument('input_cdx',
+ help="full HDFS path of CDX file to backfill")
+
sub_extract = subparsers.add_parser('extract')
sub_extract.set_defaults(func=run_extract)
sub_extract.add_argument('input_cdx',
diff --git a/sandcrawler-rfc.md b/proposals/2018_original_sandcrawler_rfc.md
index fea6a7c..ecf7ab8 100644
--- a/sandcrawler-rfc.md
+++ b/proposals/2018_original_sandcrawler_rfc.md
@@ -73,7 +73,7 @@ process HTML and look for PDF outlinks, but wouldn't crawl recursively.
HBase is used for de-dupe, with records (pointers) stored in WARCs.
A second config would take seeds as entire journal websites, and would crawl
-continously.
+continuously.
Other components of the system "push" tasks to the crawlers by copying schedule
files into the crawl action directories.
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
index c649809..768784f 100644
--- a/proposals/2019_ingest.md
+++ b/proposals/2019_ingest.md
@@ -1,5 +1,5 @@
-status: work-in-progress
+status: deployed
This document proposes structure and systems for ingesting (crawling) paper
PDFs and other content as part of sandcrawler.
@@ -84,7 +84,7 @@ HTML? Or both? Let's just recrawl.
*IngestRequest*
- `ingest_type`: required, one of `pdf`, `xml`, `html`, `dataset`. For
backwards compatibility, `file` should be interpreted as `pdf`. `pdf` and
- `xml` return file ingest respose; `html` and `dataset` not implemented but
+ `xml` return file ingest response; `html` and `dataset` not implemented but
would be webcapture (wayback) and fileset (archive.org item or wayback?).
In the future: `epub`, `video`, `git`, etc.
- `base_url`: required, where to start crawl process
@@ -258,7 +258,7 @@ and hacks to crawl publicly available papers. Related existing work includes
[unpaywall's crawler][unpaywall_crawl], LOCKSS extraction code, dissem.in's
efforts, zotero's bibliography extractor, etc. The "memento tracer" work is
also similar. Many of these are even in python! It would be great to reduce
-duplicated work and maintenance. An analagous system in the wild is youtube-dl
+duplicated work and maintenance. An analogous system in the wild is youtube-dl
for downloading video from many sources.
[unpaywall_crawl]: https://github.com/ourresearch/oadoi/blob/master/webpage.py
diff --git a/proposals/20200129_pdf_ingest.md b/proposals/20200129_pdf_ingest.md
index 9469217..157607e 100644
--- a/proposals/20200129_pdf_ingest.md
+++ b/proposals/20200129_pdf_ingest.md
@@ -1,5 +1,5 @@
-status: planned
+status: deployed
2020q1 Fulltext PDF Ingest Plan
===================================
@@ -27,7 +27,7 @@ There are a few million papers in fatacat which:
2. are known OA, usually because publication is Gold OA
3. don't have any fulltext PDF in fatcat
-As a detail, some of these "known OA" journals actually have embargos (aka,
+As a detail, some of these "known OA" journals actually have embargoes (aka,
they aren't true Gold OA). In particular, those marked via EZB OA "color", and
recent pubmed central ids.
@@ -104,7 +104,7 @@ Actions:
update ingest result table with status.
- fetch new MAG and unpaywall seedlists, transform to ingest requests, persist
into ingest request table. use SQL to dump only the *new* URLs (not seen in
- previous dumps) using the created timestamp, outputing new bulk ingest
+ previous dumps) using the created timestamp, outputting new bulk ingest
request lists. if possible, de-dupe between these two. then start bulk
heritrix crawls over these two long lists. Probably sharded over several
machines. Could also run serially (first one, then the other, with
@@ -133,7 +133,7 @@ We have run GROBID+glutton over basically all of these PDFs. We should be able
to do a SQL query to select PDFs that:
- have at least one known CDX row
-- GROBID processed successfuly and glutton matched to a fatcat release
+- GROBID processed successfully and glutton matched to a fatcat release
- do not have an existing fatcat file (based on sha1hex)
- output GROBID metadata, `file_meta`, and one or more CDX rows
@@ -161,7 +161,7 @@ Coding Tasks:
Actions:
- update `fatcat_file` sandcrawler table
-- check how many PDFs this might ammount to. both by uniq SHA1 and uniq
+- check how many PDFs this might amount to. both by uniq SHA1 and uniq
`fatcat_release` matches
- do some manual random QA verification to check that this method results in
quality content in fatcat
diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md
index 31a2db6..6f6443f 100644
--- a/proposals/20200207_pdftrio.md
+++ b/proposals/20200207_pdftrio.md
@@ -1,5 +1,8 @@
-status: in progress
+status: deployed
+
+NOTE: while this has been used in production, as of December 2022 the results
+are not used much in practice, and we don't score every PDF that comes along.
PDF Trio (ML Classification)
==============================
diff --git a/proposals/20201012_no_capture.md b/proposals/20201012_no_capture.md
index bb47ea2..7f6a1f5 100644
--- a/proposals/20201012_no_capture.md
+++ b/proposals/20201012_no_capture.md
@@ -1,5 +1,8 @@
-status: in-progress
+status: work-in-progress
+
+NOTE: as of December 2022, bnewbold can't remember if this was fully
+implemented or not.
Storing no-capture missing URLs in `terminal_url`
=================================================
@@ -29,7 +32,7 @@ The current status quo is to store the missing URL as the last element in the
pipeline that would read from the Kafka feed and extract them, but this would
be messy. Eg, re-ingesting would not update the old kafka messages, so we could
need some accounting of consumer group offsets after which missing URLs are
-truely missing.
+truly missing.
We could add a new `missing_url` database column and field to the JSON schema,
for this specific use case. This seems like unnecessary extra work.
diff --git a/proposals/20201103_xml_ingest.md b/proposals/20201103_xml_ingest.md
index 181cc11..34e00b0 100644
--- a/proposals/20201103_xml_ingest.md
+++ b/proposals/20201103_xml_ingest.md
@@ -1,22 +1,5 @@
-status: wip
-
-TODO:
-x XML fulltext URL extractor (based on HTML biblio metadata, not PDF url extractor)
-x differential JATS XML and scielo XML from generic XML?
- application/xml+jats is what fatcat is doing for abstracts
- but it should be application/jats+xml?
- application/tei+xml
- if startswith "<article " and "<article-meta>" => JATS
-x refactor ingest worker to be more general
-x have ingest code publish body to kafka topic
-x write a persist worker
-/ create/configure kafka topic
-- test everything locally
-- fatcat: ingest tool to create requests
-- fatcat: entity updates worker creates XML ingest requests for specific sources
-- fatcat: ingest file import worker allows XML results
-- ansible: deployment of persist worker
+status: deployed
XML Fulltext Ingest
====================
@@ -37,7 +20,7 @@ document. For recording in fatcat, the file metadata will be passed through.
For storing in Kafka and blob store (for downstream analysis), we will parse
the raw XML document (as "bytes") with an XML parser, then re-output with UTF-8
encoding. The hash of the *original* XML file will be used as the key for
-refering to this document. This is unintuitive, but similar to what we are
+referring to this document. This is unintuitive, but similar to what we are
doing with PDF and HTML documents (extracting in a useful format, but keeping
the original document's hash as a key).
diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md
index 793d6b5..141ece8 100644
--- a/proposals/2020_pdf_meta_thumbnails.md
+++ b/proposals/2020_pdf_meta_thumbnails.md
@@ -1,5 +1,5 @@
-status: work-in-progress
+status: deployed
New PDF derivatives: thumbnails, metadata, raw text
===================================================
@@ -133,7 +133,7 @@ Deployment will involve:
Plan for processing/catchup is:
- test with COVID-19 PDF corpus
-- run extraction on all current fatcat files avaiable via IA
+- run extraction on all current fatcat files available via IA
- integrate with ingest pipeline for all new files
- run a batch catchup job over all GROBID-parsed files with no pdf meta
extracted, on basis of SQL table query
diff --git a/proposals/2020_seaweed_s3.md b/proposals/2020_seaweed_s3.md
index 5f4ff0b..677393b 100644
--- a/proposals/2020_seaweed_s3.md
+++ b/proposals/2020_seaweed_s3.md
@@ -316,7 +316,7 @@ grows very much with the number of volumes. Therefore, keep default volume size
and do not limit number of volumes `-volume.max 0` and do not use in-memory
index (rather leveldb)
-Status: done, 200M object upload via Python script sucessfully in about 6 days,
+Status: done, 200M object upload via Python script successfully in about 6 days,
memory usage was at a moderate 400M (~10% of RAM). Relatively constant
performance at about 400 `PutObject` requests/s (over 5 threads, each thread
was around 80 requests/s; then testing with 4 threads, each thread got to
diff --git a/proposals/2021-04-22_crossref_db.md b/proposals/2021-04-22_crossref_db.md
new file mode 100644
index 0000000..1d4c3f8
--- /dev/null
+++ b/proposals/2021-04-22_crossref_db.md
@@ -0,0 +1,86 @@
+
+status: deployed
+
+Crossref DOI Metadata in Sandcrawler DB
+=======================================
+
+Proposal is to have a local copy of Crossref API metadata records in
+sandcrawler DB, accessible by simple key lookup via postgrest.
+
+Initial goal is to include these in scholar work "bundles" (along with
+fulltext, etc), in particular as part of reference extraction pipeline. Around
+late 2020, many additional references became available via Crossref records,
+and have not been imported (updated) into fatcat. Reference storage in fatcat
+API is a scaling problem we would like to put off, so injecting content in this
+way is desirable.
+
+To start, working with a bulk dump made available by Crossref. In the future,
+might persist the daily feed so that we have a continuously up-to-date copy.
+
+Another application of Crossref-in-bundles is to identify overall scale of
+changes since initial Crossref metadata import.
+
+
+## Sandcrawler DB Schema
+
+The "updated" field in this case refers to the upstream timestamp, not the
+sandcrawler database update time.
+
+ CREATE TABLE IF NOT EXISTS crossref (
+ doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+ indexed TIMESTAMP WITH TIME ZONE NOT NULL,
+ record JSON NOT NULL,
+ PRIMARY KEY(doi)
+ );
+
+For postgrest access, may need to also:
+
+ GRANT SELECT ON public.crossref TO web_anon;
+
+## SQL Backfill Command
+
+For an example file:
+
+ cat sample.json \
+ | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"
+
+For a full snapshot:
+
+ zcat crossref_public_data_file_2021_01.json.gz \
+ | pv -l \
+ | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"
+
+jq is the bottleneck (100% of a single CPU core).
+
+## Kafka Worker
+
+Pulls from the fatcat crossref ingest Kafka feed and persists into the crossref
+table.
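+
+As a very rough sketch of such a worker (not the actual sandcrawler code; the
+broker host, topic name, and database DSN below are assumptions), using the
+`confluent-kafka` and `psycopg2` packages already in the Pipfile:
+
+    import json
+
+    import psycopg2
+    from confluent_kafka import Consumer
+
+    consumer = Consumer({
+        "bootstrap.servers": "wbgrp-svc350.us.archive.org",  # assumed broker
+        "group.id": "persist-crossref",
+        "auto.offset.reset": "earliest",
+    })
+    consumer.subscribe(["fatcat-prod.api-crossref"])  # assumed topic name
+    db = psycopg2.connect("dbname=sandcrawler")
+
+    while True:
+        msg = consumer.poll(timeout=5.0)
+        if msg is None or msg.error():
+            continue
+        record = json.loads(msg.value())
+        with db.cursor() as cur:
+            # upsert, so re-delivered or updated records replace older rows
+            cur.execute(
+                """INSERT INTO crossref (doi, indexed, record)
+                   VALUES (%s, %s, %s)
+                   ON CONFLICT (doi) DO UPDATE
+                   SET indexed = EXCLUDED.indexed, record = EXCLUDED.record""",
+                (
+                    record["DOI"].lower(),
+                    record["indexed"]["date-time"],
+                    json.dumps(record),
+                ),
+            )
+        db.commit()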
+
+## SQL Table Disk Utilization
+
+An example backfill from early 2021, with about 120 million Crossref DOI
+records.
+
+Starting database size (with ingest running):
+
+ Filesystem Size Used Avail Use% Mounted on
+ /dev/vdb1 1.7T 896G 818G 53% /1
+
+ Size: 475.14G
+
+Ingest SQL command took:
+
+ 120M 15:06:08 [2.22k/s]
+ COPY 120684688
+
+Ending database size:
+
+ Filesystem Size Used Avail Use% Mounted on
+ /dev/vdb1 1.7T 1.2T 498G 71% /1
+
+ Size: 794.88G
+
+So about 320 GByte of disk.
diff --git a/proposals/2021-09-09_component_ingest.md b/proposals/2021-09-09_component_ingest.md
new file mode 100644
index 0000000..09dee4f
--- /dev/null
+++ b/proposals/2021-09-09_component_ingest.md
@@ -0,0 +1,114 @@
+
+File Ingest Mode: 'component'
+=============================
+
+A new ingest type for downloading individual files which are a subset of a
+complete work.
+
+Some publishers now assign DOIs to individual figures, supplements, and other
+"components" of an over release or document.
+
+Initial mimetypes to allow:
+
+- image/jpeg
+- image/tiff
+- image/png
+- image/gif
+- audio/mpeg
+- video/mp4
+- video/mpeg
+- text/plain
+- text/csv
+- application/json
+- application/xml
+- application/pdf
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-rar
+- application/x-7z-compressed
+- application/x-tar
+- application/vnd.ms-powerpoint
+- application/vnd.ms-excel
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+- application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+
+Intentionally not supporting:
+
+- text/html
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'component' ingest.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'component' crawls.
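+
+As a rough sketch (constant and function names here are illustrative, not the
+actual sandcrawler code), the terminal mimetype check can be a simple
+allowlist membership test:
+
+    # illustrative only; not the actual implementation
+    COMPONENT_TERMINAL_MIMETYPES = {
+        "image/jpeg", "image/tiff", "image/png", "image/gif",
+        "audio/mpeg", "video/mp4", "video/mpeg",
+        "text/plain", "text/csv",
+        "application/json", "application/xml", "application/pdf",
+        "application/gzip", "application/x-bzip", "application/x-bzip2",
+        "application/zip", "application/x-rar", "application/x-7z-compressed",
+        "application/x-tar",
+        "application/vnd.ms-powerpoint", "application/vnd.ms-excel",
+        "application/msword",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    }
+
+    def component_mimetype_allowed(mimetype: str) -> bool:
+        # strip any "; charset=..." suffix before checking
+        clean = mimetype.split(";")[0].strip().lower()
+        # text/html is intentionally excluded: landing pages are not components
+        return clean in COMPONENT_TERMINAL_MIMETYPES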
+
+
+## Examples
+
+Hundreds of thousands: <https://fatcat.wiki/release/search?q=type%3Acomponent+in_ia%3Afalse>
+
+#### ACS Supplement File
+
+<https://doi.org/10.1021/acscatal.0c02627.s002>
+
+Redirects directly to .zip in browser. SPN is blocked by cookie check.
+
+#### Frontiers .docx Supplement
+
+<https://doi.org/10.3389/fpls.2019.01642.s001>
+
+Redirects to full article page. There is a pop-up for figshare, seems hard to process.
+
+#### Figshare Single File
+
+<https://doi.org/10.6084/m9.figshare.13646972.v1>
+
+As 'component' type in fatcat.
+
+Redirects to a landing page. Dataset ingest seems more appropriate for this entire domain.
+
+#### PeerJ supplement file
+
+<https://doi.org/10.7717/peerj.10257/supp-7>
+
+PeerJ is hard because it redirects to a single HTML page, which has links to
+supplements in the HTML. Perhaps a custom extractor will work.
+
+#### eLife
+
+<https://doi.org/10.7554/elife.38407.010>
+
+The current crawl mechanism makes it seemingly impossible to extract a specific
+supplement from the document as a whole.
+
+#### Zookeys
+
+<https://doi.org/10.3897/zookeys.895.38576.figure53>
+
+These are extract-able.
+
+#### OECD PDF Supplement
+
+<https://doi.org/10.1787/f08c6324-en>
+<https://www.oecd-ilibrary.org/trade/imports-of-services-billions-of-us-dollars_f08c6324-en>
+
+Has an Excel (.xls) link, great, but then paywall.
+
+#### Direct File Link
+
+<https://doi.org/10.1787/888934207500>
+
+This one is also OECD, but is a simple direct download.
+
+#### Protein Data Base (PDB) Entry
+
+<https://doi.org/10.2210/pdb6ls2/pdb>
+
+Multiple files; dataset/fileset more appropriate for these.
diff --git a/proposals/2021-09-09_fileset_ingest.md b/proposals/2021-09-09_fileset_ingest.md
new file mode 100644
index 0000000..65c9ccf
--- /dev/null
+++ b/proposals/2021-09-09_fileset_ingest.md
@@ -0,0 +1,343 @@
+
+status: implemented
+
+Fileset Ingest Pipeline (for Datasets)
+======================================
+
+Sandcrawler currently has ingest support for individual files saved as `file`
+entities in fatcat (xml and pdf ingest types) and HTML files with
+sub-components saved as `webcapture` entities in fatcat (html ingest type).
+
+This document describes extensions to this ingest system to flexibly support
+groups of files, which may be represented in fatcat as `fileset` entities. The
+main new ingest type is `dataset`.
+
+Compared to the existing ingest process, there are two major complications with
+datasets:
+
+- the ingest process often requires more than parsing HTML files, and will be
+ specific to individual platforms and host software packages
+- the storage backend and fatcat entity type is flexible: a dataset might be
+ represented by a single file, multiple files combined in to a single .zip
+ file, or multiple separate files; the data may get archived in wayback or in
+ an archive.org item
+
+The new concepts of "strategy" and "platform" are introduced to accommodate
+these complications.
+
+
+## Ingest Strategies
+
+The ingest strategy describes the fatcat entity type that will be output; the
+storage backend used; and whether an enclosing file format is used. The
+strategy to use cannot be determined until the number and size of files are
+known. It is a function of file count, total file size, and publication
+platform.
+
+Strategy names are compact strings with the format
+`{storage_backend}-{fatcat_entity}`. A `-bundled` suffix after a `fileset`
+entity type indicates that metadata about multiple files is retained, but that
+in the storage backend only a single enclosing file (eg, `.zip`) will be
+stored.
+
+The supported strategies are:
+
+- `web-file`: single file of any type, stored in wayback, represented as fatcat `file`
+- `web-fileset`: multiple files of any type, stored in wayback, represented as fatcat `fileset`
+- `web-fileset-bundled`: single bundle file, stored in wayback, represented as fatcat `fileset`
+- `archiveorg-file`: single file of any type, stored in archive.org item, represented as fatcat `file`
+- `archiveorg-fileset`: multiple files of any type, stored in archive.org item, represented as fatcat `fileset`
+- `archiveorg-fileset-bundled`: single bundle file, stored in archive.org item, represented as fatcat `fileset`
+
+"Bundle" or "enclosing" files are things like .zip or .tar.gz. Not all .zip
+files are handled as bundles! Only when the transfer from the hosting platform
+is via a "download all as .zip" (or similar) do we consider a zipfile a
+"bundle" and index the interior files as a fileset.
+
+The term "bundle file" is used over "archive file" or "container file" to
+prevent confusion with the other use of those terms in the context of fatcat
+(container entities; archive; Internet Archive as an organization).
+
+The motivation for supporting both `web` and `archiveorg` is that `web` is
+somewhat simpler for small files, but `archiveorg` is better for larger groups
+of files (say more than 20) and larger total size (say more than 1 GByte total,
+or 128 MByte for any one file).
+
+The motivation for supporting "bundled" filesets is that there is only a single
+file to archive.
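+
+A rough sketch of how strategy selection could work, using the thresholds
+above (the real per-platform `chose_strategy()` logic may differ, eg in when a
+bundle is preferred):
+
+    from typing import List
+
+    def chose_strategy_sketch(file_sizes: List[int], has_bundle_url: bool) -> str:
+        # "large" filesets go to archive.org items rather than wayback
+        large = (
+            len(file_sizes) > 20
+            or sum(file_sizes) > 1_000_000_000            # ~1 GByte total
+            or max(file_sizes, default=0) > 128_000_000   # ~128 MByte single file
+        )
+        backend = "archiveorg" if large else "web"
+        if len(file_sizes) == 1:
+            return f"{backend}-file"
+        if has_bundle_url:
+            return f"{backend}-fileset-bundled"
+        return f"{backend}-fileset"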
+
+
+## Ingest Pseudocode
+
+1. Determine `platform`, which may involve resolving redirects and crawling a landing page.
+
+ a. currently we always crawl the ingest `base_url`, capturing a platform landing page
+ b. we don't currently handle the case of `base_url` leading to a non-HTML
+ terminal resource. the `component` ingest type does handle this
+
+2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
+
+ a. depending on platform, may include access URLs for multiple strategies
+ (eg, URL for each file and a bundle URL), metadata about the item for, eg,
+ archive.org item upload, etc
+
+3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
+
+4. Summarize status and return structured result metadata.
+
+ a. if the strategy was `web-file` or `archiveorg-file`, potentially submit an
+ `ingest_file_result` object down the file ingest pipeline (Kafka topic and
+ later persist and fatcat import workers), with `dataset-file` ingest
+ type (or `{ingest_type}-file` more generally).
+
+New python types:
+
+ FilesetManifestFile
+ path: str
+ size: Optional[int]
+ md5: Optional[str]
+ sha1: Optional[str]
+ sha256: Optional[str]
+ mimetype: Optional[str]
+ extra: Optional[Dict[str, Any]]
+
+ status: Optional[str]
+ platform_url: Optional[str]
+ terminal_url: Optional[str]
+ terminal_dt: Optional[str]
+
+ FilesetPlatformItem
+ platform_name: str
+ platform_status: str
+ platform_domain: Optional[str]
+ platform_id: Optional[str]
+ manifest: Optional[List[FilesetManifestFile]]
+ archiveorg_item_name: Optional[str]
+ archiveorg_item_meta
+ web_base_url
+ web_bundle_url
+
+ ArchiveStrategyResult
+ ingest_strategy: str
+ status: str
+ manifest: List[FilesetManifestFile]
+ file_file_meta: Optional[dict]
+ file_terminal: Optional[dict]
+ file_cdx: Optional[dict]
+ bundle_file_meta: Optional[dict]
+ bundle_terminal: Optional[dict]
+ bundle_cdx: Optional[dict]
+ bundle_archiveorg_path: Optional[dict]
+
+New python APIs/classes:
+
+ FilesetPlatformHelper
+ match_request(request, resource, html_biblio) -> bool
+ does the request and landing page metadata indicate a match for this platform?
+ process_request(request, resource, html_biblio) -> FilesetPlatformItem
+ do API requests, parsing, etc to fetch metadata and access URLs for this fileset/dataset. platform-specific
+ chose_strategy(item: FilesetPlatformItem) -> IngestStrategy
+ select an archive strategy for the given fileset/dataset
+
+ FilesetIngestStrategy
+ check_existing(item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]
+ check the given backend for an existing capture/archive; if found, return result
+ process(item: FilesetPlatformItem) -> ArchiveStrategyResult
+ perform an actual archival capture
+
+## Limits and Failure Modes
+
+- `too-large-size`: total size of the fileset is too large for archiving.
+ initial limit is 64 GBytes, controlled by `max_total_size` parameter.
+- `too-many-files`: number of files (and thus file-level metadata) is too
+ large. initial limit is 200, controlled by `max_file_count` parameter.
+- `platform-scope / FilesetPlatformScopeError`: for when `base_url` leads to a
+ valid platform, which could be found via API or parsing, but has the wrong
+ scope. Eg, tried to fetch a dataset, but got a DOI which represents all
+ versions of the dataset, not a specific version.
+- `platform-restricted`/`PlatformRestrictedError`: for, eg, embargoes
+- `platform-404`: got to a landing page, and seemed like in-scope, but no
+ platform record found anyways
+
+
+## New Sandcrawler Code and Worker
+
+ sandcrawler-ingest-fileset-worker@{1..6} (or up to 1..12 later)
+
+Worker consumes from ingest request topic, produces to fileset ingest results,
+and optionally produces to file ingest results.
+
+ sandcrawler-persist-ingest-fileset-worker@1
+
+Simply writes fileset ingest rows to SQL.
+
+
+## New Fatcat Worker and Code Changes
+
+ fatcat-import-ingest-fileset-worker
+
+This importer is modeled on file and web worker. Filters for `success` with
+strategy of `*-fileset*`.
+
+Existing `fatcat-import-ingest-file-worker` should be updated to allow
+`dataset` single-file imports, with largely same behavior and semantics as
+current importer (`component` mode).
+
+Existing fatcat transforms, and possibly even elasticsearch schemas, should be
+updated to include fileset status and `in_ia` flag for dataset type releases.
+
+Existing entity updates worker submits `dataset` type ingests to ingest request
+topic.
+
+
+## Ingest Result Schema
+
+Common with file results, and mostly relating to landing page HTML:
+
+ hit: bool
+ status: str
+ success
+ success-existing
+ success-file (for `web-file` or `archiveorg-file` only)
+ request: object
+ terminal: object
+ file_meta: object
+ cdx: object
+ revisit_cdx: object
+ html_biblio: object
+
+Additional fileset-specific fields:
+
+ manifest: list of objects
+ platform_name: str
+ platform_domain: str
+ platform_id: str
+ platform_base_url: str
+ ingest_strategy: str
+ archiveorg_item_name: str (optional, only for `archiveorg-*` strategies)
+ file_count: int
+ total_size: int
+ fileset_bundle (optional, only for `*-fileset-bundle` strategy)
+ file_meta
+ cdx
+ revisit_cdx
+ terminal
+ archiveorg_bundle_path
+ fileset_file (optional, only for `*-file` strategy)
+ file_meta
+ terminal
+ cdx
+ revisit_cdx
+
+If the strategy was `web-file` or `archiveorg-file` and the status is
+`success-file`, then an ingest file result will also be published to
+`sandcrawler-ENV.ingest-file-results`, using the same ingest type and fields as
+regular ingest.
+
+
+All fileset ingest results get published to ingest-fileset-result.
+
+Existing sandcrawler persist workers also subscribe to this topic and persist
+status and landing page terminal info to tables just like with file ingest.
+GROBID, HTML, and other metadata is not persisted in this path.
+
+If the ingest strategy was a single file (`*-file`), then an ingest file is
+also published to the ingest-file-result topic, with the `fileset_file`
+metadata, and ingest type `dataset-file`. This should only happen on success
+condition.
+
+
+## New SQL Tables
+
+Note that this table *complements* `ingest_file_result`, doesn't replace it.
+`ingest_file_result` could more accurately be called `ingest_result`.
+
+ CREATE TABLE IF NOT EXISTS ingest_fileset_platform (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+
+ platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1),
+ platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1),
+ platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1),
+ ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1),
+ total_size BIGINT,
+ file_count BIGINT,
+ archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1),
+
+ archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1),
+ web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1),
+ web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14),
+
+ manifest JSONB,
+ -- list, similar to fatcat fileset manifest, plus extra:
+ -- status (str)
+ -- path (str)
+ -- size (int)
+ -- md5 (str)
+ -- sha1 (str)
+ -- sha256 (str)
+ -- mimetype (str)
+ -- extra (dict)
+ -- platform_url (str)
+ -- terminal_url (str)
+ -- terminal_dt (str)
+
+ PRIMARY KEY (ingest_type, base_url)
+ );
+ CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id);
+
+Persist worker should only insert in to this table if `platform_name` is
+identified.
+
+## New Kafka Topic
+
+ sandcrawler-ENV.ingest-fileset-results 6x, no retention limit
+
+
+## Implementation Plan
+
+First implement ingest worker, including platform and strategy helpers, and
+test those as simple stdin/stdout CLI tools in sandcrawler repo to validate
+this proposal.
+
+Second implement fatcat importer and test locally and/or in QA.
+
+Lastly implement infrastructure, automation, and other "glue":
+
+- SQL schema
+- persist worker
+
+
+## Design Note: Single-File Datasets
+
+Should datasets and other groups of files which only contain a single file get
+imported as a fatcat `file` or `fileset`? This can be broken down further as
+documents (single PDF) vs other individual files.
+
+Advantages of `file`:
+
+- handles case of article PDFs being marked as dataset accidentally
+- `file` entities get de-duplicated with simple lookup (eg, on `sha1`)
+- conceptually simpler if individual files are `file` entity
+- easier to download individual files
+
+Advantages of `fileset`:
+
+- conceptually simpler if all `dataset` entities have `fileset` form factor
+- code path is simpler: one fewer strategy, and less complexity of sending
+ files down separate import path
+- metadata about platform is retained
+- would require no modification of existing fatcat file importer
+- fatcat import of archive.org-hosted `file` content is not actually implemented yet?
+
+Decision is to do individual files. Fatcat fileset import worker should reject
+single-file (and empty) manifest filesets. Fatcat file import worker should
+accept all mimetypes for `dataset-file` (similar to `component`).
+
+
+## Example Entities
+
+See `notes/dataset_examples.txt`
diff --git a/proposals/2021-09-13_src_ingest.md b/proposals/2021-09-13_src_ingest.md
new file mode 100644
index 0000000..470827a
--- /dev/null
+++ b/proposals/2021-09-13_src_ingest.md
@@ -0,0 +1,53 @@
+
+File Ingest Mode: 'src'
+=======================
+
+Ingest type for "source" of works in document form. For example, tarballs of
+LaTeX source and figures, as published on arxiv.org and Pubmed Central.
+
+For now, presumption is that this would be a single file (`file` entity in
+fatcat).
+
+Initial mimetypes to allow:
+
+- text/x-tex
+- application/xml
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-tar
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'src' ingest.
+
+Might keep ingest disabled on the fatcat side, at least initially. Eg, until
+there is some notion of "file scope", or other ways of treating 'src' tarballs
+separately from PDFs or other fulltext formats.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'src' crawls.
+
+
+## Examples
+
+ arxiv:2109.00954v1
+ fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy
+ https://arxiv.org/format/2109.00954v1
+ https://arxiv.org/e-print/2109.00954v1
+
+ arxiv:1912.03397v2
+ https://arxiv.org/format/1912.03397v2
+ https://arxiv.org/e-print/1912.03397v2
+ NOT: https://arxiv.org/pdf/1912.03397v2
+
+ pmcid:PMC3767916
+ https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz
+
+For PMC, will need to use one of the .csv file lists to get the digit prefixes.
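+
+A rough sketch of that lookup (hedged: the exact .csv layout is an assumption;
+here the PMCID is taken from the tarball filename in the first column):
+
+    import csv
+    from typing import Dict, Set
+
+    def pmc_tarball_urls(file_list_csv: str, pmcids: Set[str]) -> Dict[str, str]:
+        # rows are expected to look roughly like:
+        #   oa_package/08/03/PMC3767916.tar.gz,<citation>,PMC3767916,...
+        base = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/"
+        urls = {}
+        with open(file_list_csv, newline="") as f:
+            for row in csv.reader(f):
+                rel_path = row[0]
+                pmcid = rel_path.rsplit("/", 1)[-1].split(".")[0]
+                if pmcid in pmcids:
+                    urls[pmcid] = base + rel_path
+        return urls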
diff --git a/proposals/2021-09-21_spn_accounts.md b/proposals/2021-09-21_spn_accounts.md
new file mode 100644
index 0000000..e41c162
--- /dev/null
+++ b/proposals/2021-09-21_spn_accounts.md
@@ -0,0 +1,14 @@
+
+Formalization of SPNv2 API requests from fatcat/sandcrawler
+
+Create two new system accounts, one for regular/daily ingest requests, one for
+priority requests (save-paper-now or as a flag with things like fatcat-ingest;
+"interactive"). These accounts should have @archive.org emails. Request the
+daily one to have the current rate limit as bnewbold@archive.org account; the
+priority queue can have less.
+
+Create new ingest kafka queues from scratch, one for priority and one for
+regular. Choose sizes carefully: probably keep 24x for the regular and do 6x or
+so (small) for priority queue.
+
+Deploy new priority workers; reconfigure/deploy broadly.
diff --git a/proposals/2021-10-28_grobid_refs.md b/proposals/2021-10-28_grobid_refs.md
new file mode 100644
index 0000000..1fc79b6
--- /dev/null
+++ b/proposals/2021-10-28_grobid_refs.md
@@ -0,0 +1,125 @@
+
+GROBID References in Sandcrawler DB
+===================================
+
+Want to start processing "unstructured" raw references coming from upstream
+metadata sources (distinct from upstream fulltext sources, like PDFs or JATS
+XML), and save the results in sandcrawler DB. From there, they will get pulled
+in to fatcat-scholar "intermediate bundles" and included in reference exports.
+
+The initial use case for this is to parse "unstructured" references deposited
+in Crossref, and include them in refcat.
+
+
+## Schema and Semantics
+
+The output JSON/dict schema for parsed references follows that of
+`grobid_tei_xml` version 0.1.x, for the `GrobidBiblio` field. The
+`unstructured` field that was parsed is included in the output, though it may
+not be byte-for-byte exact (see below). One notable change from the past (eg,
+older GROBID-parsed references) is that author `name` is now `full_name`. New
+fields include `editors` (same schema as `authors`), `book_title`, and
+`series_title`.
+
+The overall output schema matches that of the `grobid_refs` SQL table:
+
+ source: string, lower-case. eg 'crossref'
+ source_id: string, eg '10.1145/3366650.3366668'
+    source_ts: optional timestamp (full ISO datetime with timezone, eg a `Z`
+        suffix) which identifies the version of the upstream metadata
+ refs_json: JSON, list of `GrobidBiblio` JSON objects
+
+References are re-processed on a per-article (or per-release) basis. All the
+references for an article are handled as a batch and output as a batch. If
+there are no upstream references, a row with `refs_json` as an empty list may be
+returned.
+
+Not all upstream references get re-parsed, even if an 'unstructured' field is
+available. If 'unstructured' is not available, no row is ever output. For
+example, if a reference includes `unstructured` (raw citation string), but also
+has structured metadata for authors, title, year, and journal name, we might
+not re-parse the `unstructured` string. Whether to re-parse is evaluated on a
+per-reference basis. This behavior may change over time.
+
+`unstructured` strings may be pre-processed before being submitted to GROBID.
+This is because many sources have systemic encoding issues. GROBID itself may
+also do some modification of the input citation string before returning it in
+the output. This means the `unstructured` string is not a reliable way to map
+between specific upstream references and parsed references. Instead, the `id`
+field (str) of `GrobidBiblio` gets set to any upstream "key" or "index"
+identifier used to track individual references. If there is only a numeric
+index, the `id` is that number as a string.
+
+The `key` or `id` may need to be woven back into the ref objects manually,
+because GROBID `processCitationList` takes just a list of raw strings, with no
+attached reference-level key or id.
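+
+A minimal sketch of that weaving step, assuming the parsed refs come back as
+dicts in the same order they were submitted (Crossref reference objects carry
+a `key` field):
+
+    from typing import Any, Dict, List
+
+    def weave_ref_ids(
+        upstream_refs: List[Dict[str, Any]],
+        parsed_refs: List[Dict[str, Any]],
+    ) -> List[Dict[str, Any]]:
+        assert len(upstream_refs) == len(parsed_refs)
+        for index, (upstream, parsed) in enumerate(zip(upstream_refs, parsed_refs)):
+            # prefer the upstream key; fall back to the numeric index as a string
+            parsed["id"] = upstream.get("key") or str(index)
+        return parsed_refs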
+
+
+## New SQL Table and View
+
+We may want to do re-parsing of references from sources other than `crossref`,
+so there is a generic `grobid_refs` table. But it is also common to fetch both
+the crossref metadata and any re-parsed references together, so as a convenience
+there is a PostgreSQL view (virtual table) that includes both a crossref
+metadata record and parsed citations, if available. If downstream code cares a
+lot about having the refs and record be in sync, the `source_ts` field on
+`grobid_refs` can be matched against the `indexed` column of `crossref` (or the
+`.indexed.date-time` JSON field in the record itself).
+
+Remember that DOIs should always be lower-cased before querying, inserting,
+comparing, etc.
+
+ CREATE TABLE IF NOT EXISTS grobid_refs (
+ source TEXT NOT NULL CHECK (octet_length(source) >= 1),
+ source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1),
+ source_ts TIMESTAMP WITH TIME ZONE,
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ refs_json JSON NOT NULL,
+ PRIMARY KEY(source, source_id)
+ );
+
+ CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS
+ SELECT
+ crossref.doi as doi,
+ crossref.indexed as indexed,
+ crossref.record as record,
+ grobid_refs.source_ts as source_ts,
+ grobid_refs.refs_json as refs_json
+ FROM crossref
+ LEFT JOIN grobid_refs ON
+ grobid_refs.source_id = crossref.doi
+ AND grobid_refs.source = 'crossref';
+
+Both `grobid_refs` and `crossref_with_refs` will be exposed through postgrest.
+
+
+## New Workers / Tools
+
+For simplicity, to start, a single worker will consume from
+`fatcat-prod.api-crossref`, process citations with GROBID (if necessary), and
+insert into both the `crossref` and `grobid_refs` tables. This worker will run
+locally on the machine with sandcrawler-db.
+
+Another tool will support taking large chunks of Crossref JSON (as lines),
+filter them, process with GROBID, and print JSON to stdout, in the
+`grobid_refs` JSON schema.
+
+
+## Task Examples
+
+Command to process crossref records with refs tool:
+
+ cat crossref_sample.json \
+ | parallel -j5 --linebuffer --round-robin --pipe ./grobid_tool.py parse-crossref-refs - \
+ | pv -l \
+ > crossref_sample.parsed.json
+
+ # => 10.0k 0:00:27 [ 368 /s]
+
+Load directly in to postgres (after tables have been created):
+
+ cat crossref_sample.parsed.json \
+ | jq -rc '[.source, .source_id, .source_ts, (.refs_json | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY grobid_refs (source, source_id, source_ts, refs_json) FROM STDIN (DELIMITER E'\t');"
+
+ # => COPY 9999
diff --git a/proposals/2021-12-09_trawling.md b/proposals/2021-12-09_trawling.md
new file mode 100644
index 0000000..33b6b4c
--- /dev/null
+++ b/proposals/2021-12-09_trawling.md
@@ -0,0 +1,180 @@
+
+status: work-in-progress
+
+NOTE: as of December 2022, the implementation of these features hasn't been
+merged to the main branch. Development stalled in December 2021.
+
+Trawling for Unstructured Scholarly Web Content
+===============================================
+
+## Background and Motivation
+
+A long-term goal for sandcrawler has been the ability to pick through
+unstructured web archive content (or even non-web collection), identify
+potential in-scope research outputs, extract metadata for those outputs, and
+merge the content in to a catalog (fatcat).
+
+This process requires integration of many existing tools (HTML and PDF
+extraction; fuzzy bibliographic metadata matching; machine learning to identify
+in-scope content; etc), as well as high-level curration, targetting, and
+evaluation by human operators. The goal is to augment and improve the
+productivity of human operators as much as possible.
+
+This process will be similar to "ingest", which is where we start with a
+specific URL and have some additional context about the expected result (eg,
+content type, exernal identifier). Some differences with trawling are that we
+are start with a collection or context (instead of single URL); have little or
+no context about the content we are looking for; and may even be creating a new
+catalog entry, as opposed to matching to a known existing entry.
+
+
+## Architecture
+
+The core operation is to take a resource and run a flowchart of processing
+steps on it, resulting in an overall status and possible related metadata. The
+common case is that the resource is a PDF or HTML coming from wayback (with
+contextual metadata about the capture), but we should be flexible to supporting
+more content types in the future, and should try to support plain files with no
+context as well.
+
+Some relatively simple wrapper code handles fetching resources and summarizing
+status/counts.
+
+Outside of the scope of sandcrawler, new fatcat code (importer or similar) will
+be needed to handle trawl results. It will probably make sense to pre-filter
+(with `jq` or `rg`) before passing results to fatcat.
+
+At this stage, trawl workers will probably be run manually. Some successful
+outputs (like GROBID, HTML metadata) would be written to existing kafka topics
+to be persisted, but there would not be any specific `trawl` SQL tables or
+automation.
+
+It will probably be helpful to have some kind of wrapper script that can run
+sandcrawler trawl processes, then filter and pipe the output into fatcat
+importer, all from a single invocation, while reporting results.
+
+TODO:
+- for HTML imports, do we fetch the full webcapture stuff and return that?
+
+
+## Methods of Operation
+
+### `cdx_file`
+
+An existing CDX file is provided on-disk locally.
+
+### `cdx_api`
+
+Simplified variants: `cdx_domain`, `cdx_surt`
+
+Uses CDX API to download records matching the configured filters, then processes the file.
+
+Saves the CDX file intermediate result somewhere locally (working or tmp
+directory), with timestamp in the path, to make re-trying with `cdx_file` fast
+and easy.
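+
+A minimal sketch of this fetch-and-cache step (the CDX API parameters shown
+are illustrative; real filters would come from the trawl request config):
+
+    import datetime
+    import os
+
+    import requests
+
+    def fetch_cdx_to_file(domain: str, work_dir: str = "/tmp/trawl") -> str:
+        os.makedirs(work_dir, exist_ok=True)
+        timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
+        out_path = os.path.join(work_dir, f"{domain}.{timestamp}.cdx")
+        resp = requests.get(
+            "https://web.archive.org/cdx/search/cdx",
+            params={
+                "url": domain,
+                "matchType": "domain",
+                "filter": "mimetype:application/pdf",
+            },
+            timeout=120,
+        )
+        resp.raise_for_status()
+        with open(out_path, "w") as f:
+            f.write(resp.text)
+        return out_path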
+
+
+### `archiveorg_web_collection`
+
+Uses `cdx_collection.py` (or similar) to fetch a full CDX list by iterating
+over the items in the collection, then processes it.
+
+Saves the CDX file intermediate result somewhere locally (working or tmp
+directory), with timestamp in the path, to make re-trying with `cdx_file` fast
+and easy.
+
+### Others
+
+- `archiveorg_file_collection`: fetch file list via archive.org metadata, then processes each
+
+## Schema
+
+Per-resource results:
+
+ hit (bool)
+ indicates whether resource seems in scope and was processed successfully
+ (roughly, status 'success', and
+ status (str)
+ success: fetched resource, ran processing, pa
+ skip-cdx: filtered before even fetching resource
+ skip-resource: filtered after fetching resource
+ wayback-error (etc): problem fetching
+ content_scope (str)
+ filtered-{filtertype}
+ article (etc)
+ landing-page
+ resource_type (str)
+ pdf, html
+ file_meta{}
+ cdx{}
+ revisit_cdx{}
+
+ # below are resource_type specific
+ grobid
+ pdf_meta
+ pdf_trio
+ html_biblio
+ (other heuristics and ML)
+
+High-level request:
+
+ trawl_method: str
+ cdx_file_path
+ default_filters: bool
+ resource_filters[]
+ scope: str
+ surt_prefix, domain, host, mimetype, size, datetime, resource_type, http_status
+ value: any
+ values[]: any
+ min: any
+ max: any
+ biblio_context{}: set of expected/default values
+ container_id
+ release_type
+ release_stage
+ url_rel
+
+High-level summary / results:
+
+ status
+ request{}: the entire request object
+ counts
+ total_resources
+ status{}
+ content_scope{}
+ resource_type{}
+
+## Example Corpuses
+
+All PDFs (`application/pdf`) in web.archive.org from before the year 2000.
+Starting point would be a CDX list.
+
+Spidering crawls starting from a set of OA journal homepage URLs.
+
+Archive-It partner collections from research universities, particularly of
+their own .edu domains. Starting point would be an archive.org collection, from
+which WARC files or CDX lists can be accessed.
+
+General archive.org PDF collections, such as
+[ERIC](https://archive.org/details/ericarchive) or
+[Document Cloud](https://archive.org/details/documentcloud).
+
+Specific Journal or Publisher URL patterns. Starting point could be a domain,
+hostname, SURT prefix, and/or URL regex.
+
+Heuristic patterns over full web.archive.org CDX index. For example, .edu
+domains with user directories and a `.pdf` in the file path ("tilde" username
+pattern).
+
+Random samples of entire Wayback corpus. For example, random samples filtered
+by date, content type, TLD, etc. This would be true "trawling" over the entire
+corpus.
+
+
+## Other Ideas
+
+Could have a web archive spidering mode: starting from a seed, fetch multiple
+captures (different captures), then extract outlinks from those, up to some
+number of hops. An example application would be links to research group
+webpages or author homepages, and to try to extract PDF links from CVs, etc.
+
diff --git a/proposals/brainstorm/2021-debug_web_interface.md b/proposals/brainstorm/2021-debug_web_interface.md
new file mode 100644
index 0000000..442b439
--- /dev/null
+++ b/proposals/brainstorm/2021-debug_web_interface.md
@@ -0,0 +1,9 @@
+
+status: brainstorm idea
+
+Simple internal-only web interface to help debug ingest issues.
+
+- paste a hash, URL, or identifier and get a display of "everything we know" about it
+- enter a URL/SURT prefix and get aggregate stats (?)
+- enter a domain/host/prefix and get recent attempts/results
+- pre-computed periodic reports on ingest pipeline (?)
diff --git a/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md b/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md
new file mode 100644
index 0000000..b3ad447
--- /dev/null
+++ b/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md
@@ -0,0 +1,36 @@
+
+status: brainstorming
+
+We continue to see issues with heritrix3-based crawling. Would like to have an
+option to switch to higher-throughput heritrix-based crawling.
+
+SPNv2 path would stick around at least for save-paper-now style ingest.
+
+
+## Sketch
+
+Ingest requests are created continuously by fatcat, with daily spikes.
+
+Ingest workers run mostly in "bulk" mode, aka they don't make SPNv2 calls.
+`no-capture` responses are recorded in sandcrawler SQL database.
+
+Periodically (daily?), a script queries for new no-capture results, filtered to
+the most recent period. These are processed a bit into a URL list, then
+converted to a heritrix frontier, and sent to crawlers. This could either be an
+h3 instance (?), or simple `scp` to a running crawl directory.
+
+The crawler crawls, with usual landing page config, and draintasker runs.
+
+TODO: can we have draintasker/heritrix set a maximum WARC life? Like 6 hours?
+or, target a smaller draintasker item size, so they get updated more frequently
+
+Another SQL script dumps ingest requests from the *previous* period, and
+re-submits them for bulk-style ingest (by workers).
+
+The end result would be things getting crawled and updated within a couple
+days.
+
+
+## Sketch 2
+
+Upload URL list to petabox item, wait for heritrix derive to run (!)
diff --git a/python/.coveragerc b/python/.coveragerc
index 67053a7..51038d6 100644
--- a/python/.coveragerc
+++ b/python/.coveragerc
@@ -2,4 +2,3 @@
omit = tests/*
source =
sandcrawler
- grobid2json
diff --git a/python/.flake8 b/python/.flake8
new file mode 100644
index 0000000..c7ef5fe
--- /dev/null
+++ b/python/.flake8
@@ -0,0 +1,21 @@
+[flake8]
+select = C,E,F,W,ANN
+# ANN003 is annotation on, eg, **kwargs
+# ANN101 is annotation on 'self' (why would that be wanted?)
+# ANN204 is annotation on '__init__()'
+# ANN401 is 'Any' type
+# E265,E266 are restrictions on comments ('#')
+# E501 is line-too-long, which we enforce with black
+# W503,E203 are allowed by black
+# TODO: C901 is complexity, should be re-enabled at some point
+ignore = ANN003,ANN101,ANN204,ANN401,E265,E266,E501,C901,W503,E203
+per-file-ignores =
+ sandcrawler/__init__.py: F401
+ sandcrawler/ia.py: E402
+ tests/*.py: ANN201,ANN001,F403,F405
+ # TODO: add more annotations to CLI scripts
+ *_tool.py,sandcrawler_worker.py: ANN201,ANN001,ANN202,ANN206,ANN205,F403,F405
+ scripts: ANN201,ANN001,ANN202,ANN206,ANN205
+exclude = .git,__pycache__,.venv,scripts/
+max-line-length = 96
+max-complexity = 30
diff --git a/python/.gitignore b/python/.gitignore
index 9b0fdb2..a5a773e 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,4 +1,14 @@
*part-000*
*.tar.gz
-*.tsv.gz
+*.gz
htmlcov/
+samples/
+*.json
+TODO*
+*.tsv
+
+!.flake8
+!.gitlab-ci.yml
+!.pylintrc
+!.coveragerc
+!.gitignore
diff --git a/python/Makefile b/python/Makefile
index 9593ab4..940a7eb 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -14,14 +14,14 @@ deps: ## Install dependencies using pipenv
.PHONY: lint
lint: ## Run lints (eg, flake8, mypy)
- #pipenv run flake8 . --exit-zero
- pipenv run flake8 . --select=E9,F63,F7,F82 --exit-zero
+ pipenv run flake8 . --exit-zero
+ pipenv run isort -q -c . || true
pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports
- pipenv run pylint --rcfile=.pylintrc -E --jobs=4 sandcrawler tests *.py
.PHONY: fmt
fmt: ## Run code formating on all source code
- pipenv run black *.py sandcrawler/ tests/
+ pipenv run isort --atomic .
+ pipenv run black --line-length 96 sandcrawler/ tests/ scripts/ *.py
.PHONY: test
test: ## Run all tests and lints
diff --git a/python/Pipfile b/python/Pipfile
index bb3f180..b841755 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -1,6 +1,6 @@
[[source]]
name = "ia"
-url = "https://devpi.archive.org/wb/prod"
+url = "https://devpi.us.archive.org/wb/prod"
verify_ssl = true
[[source]]
@@ -20,38 +20,41 @@ ipython = "*"
mypy = "*"
flake8 = "*"
flake8-annotations = "*"
-
-# pytype is failing to install on xenial VMs
-#pytype = "*"
+isort = "*"
+types-requests = "*"
+types-beautifulsoup4 = "*"
+types-dateparser = "*"
+types-psycopg2 = "*"
+types-Pillow = "*"
+black = "*"
[packages]
requests = ">=2"
-raven = {extras = ['flask'],version = "*"}
confluent-kafka = "*"
python-snappy = "*"
boto3 = "*"
-minio = "*"
+minio = "<7.0.0"
psycopg2 = "*"
bs4 = "*"
python-magic = "*"
ftfy = "*"
internetarchive = "*"
-Flask = ">=1"
urlcanon = "*"
-pillow = ">=3"
+Pillow = ">=3"
python-poppler = ">=0.2.1"
selectolax = ">=0.2"
-trafilatura = "*"
+# constrain trafilatura to prevent a `charset_normalizer` version
+# conflict between htmldate and requests
+trafilatura = ">=1,<1.4"
+htmldate = ">=1,<1.4"
pydantic = ">=1.7"
dateparser = "*"
braveblock = "*"
dynaconf = ">=3"
sentry-sdk = { version = ">=0.14.0", extras = [] }
zstandard = "*"
-
-# must lock black to an exact version because it is still "beta"
-# see: https://github.com/psf/black/issues/517
-black = "==19.10b0"
+grobid_tei_xml = ">=0.1.2,<0.2.0"
+PyMuPDF = ">=1.19.0,<1.20.0"
[requires]
python_version = "3.8"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 3551b54..546a420 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "88f69c9f6bcc19909238c16af9cb7101959139fc1b6012af31f17900c89e3aea"
+ "sha256": "35d0f0cd2f3903cce19d5a73f50a89ba09a1b43abbda84894fd45411d7f32760"
},
"pipfile-spec": 6,
"requires": {
@@ -10,7 +10,7 @@
"sources": [
{
"name": "ia",
- "url": "https://devpi.archive.org/wb/prod",
+ "url": "https://devpi.us.archive.org/wb/prod",
"verify_ssl": true
},
{
@@ -21,113 +21,161 @@
]
},
"default": {
- "appdirs": {
- "hashes": [
- "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41",
- "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"
- ],
- "version": "==1.4.4"
- },
- "attrs": {
- "hashes": [
- "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6",
- "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"
- ],
- "version": "==20.3.0"
- },
- "backports.csv": {
- "hashes": [
- "sha256:1277dfff73130b2e106bf3dd347adb3c5f6c4340882289d88f31240da92cbd6d",
- "sha256:21f6e09bab589e6c1f877edbc40277b65e626262a86e69a70137db714eaac5ce"
- ],
- "version": "==1.0.7"
+ "async-timeout": {
+ "hashes": [
+ "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15",
+ "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.0.2"
+ },
+ "backports.zoneinfo": {
+ "hashes": [
+ "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
+ "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
+ "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
+ "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
+ "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
+ "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
+ "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
+ "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
+ "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
+ "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
+ "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
+ "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
+ "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
+ "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
+ "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
+ "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
+ ],
+ "markers": "python_version < '3.9' and python_version >= '3.6' and python_version < '3.9'",
+ "version": "==0.2.1"
},
"beautifulsoup4": {
"hashes": [
- "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
- "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
- "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
- ],
- "version": "==4.9.3"
- },
- "black": {
- "hashes": [
- "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b",
- "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"
- ],
- "index": "ia",
- "version": "==19.10b0"
- },
- "blinker": {
- "hashes": [
- "sha256:471aee25f3992bd325afa3772f1063dbdbbca947a041b8b89466dc00d606f8b6"
+ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
+ "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
],
- "version": "==1.4"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.11.1"
},
"boto3": {
"hashes": [
- "sha256:ba8de10d3ede338d51ae47e428b97dcc1d1b507741aa98697e63e879a147f4aa",
- "sha256:e3f10ed6d9ca98415fdec15c85e50a89ec38d6229bce3fafd5e7965b16c4ebc5"
+ "sha256:7a6766c7177a9c6f85365e02aabd96ca4d72e08bc5cb127cb51b0a97ac9b9d1b",
+ "sha256:82b790b1dabd0746b028d2013b5d4d636a41f3aaf25520081f4c173cb6eb395d"
],
"index": "ia",
- "version": "==1.16.44"
+ "version": "==1.26.37"
},
"botocore": {
"hashes": [
- "sha256:4ff05bc089ba78a5996f06dcfddf8ca51583e30ce779ed95e9952e90c1907420",
- "sha256:7725e08c95ae96c4dbd955cb4ae44a0c06d3e41f76a7feb0a941c27a44c63113"
+ "sha256:18ab8e95345a6d0d2653ce65d261a0aef6fef8a57a35a89e3cea6ffe315e92fc",
+ "sha256:3afa4fec9f7713caa05116563b38f81bec7bd20585d517155484d3f25efab5aa"
],
- "version": "==1.19.44"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.29.37"
},
"braveblock": {
"hashes": [
- "sha256:02e630baa202d0294702bf0274033abe27ee1f17a40c8e2651594f533db645e3",
- "sha256:1a0a7aff2c36745f60ad2e08be050c6c9b2e5e1eeff4e6d5cf4ea02de8a5eacc",
- "sha256:1bae851fbe20f94055402fb8b854ce7ed393336808bd85c65c36d03e360a9aef",
- "sha256:2a5648640ddb063bfd0987142b678ddf078ac3f6081a91f13cc80ad3c876944f",
- "sha256:2fe6bce6ae47449724be267b4ba8d76d4c897091dda5c51c6aaab242e004118d",
- "sha256:5bad6da1d03cd32ba238055f77a45d2cdb4cf6d2bb06e9c9eb7c0c050d24dd4c",
- "sha256:5eb9eaf9f9d98e787f6473d2b6f4f3420389ded86e68352487e403b57793a7fe",
- "sha256:784be89c29ffedfbde1b4748ccb9c678d16d79c839791fc49e4947051f47ceb0",
- "sha256:a47e34ba4114c3a63b2b2f4cd6bb2a596cfbd5ecee00efb96924b39a84aa080f",
- "sha256:c9a8d4c649ececf2d3d7bcfa11a9d35d68568ac2bbaa2edb9807f8ceeafdc762",
- "sha256:cf54faf4de8bd123e64c85800fde5515b45c265ea5b260cb198233bd83b6c4d9",
- "sha256:f74cc53561827368cf6534004cd05f83e39fdd1d1714b63b9bd7919b299f970f"
+ "sha256:0bfca14473275366f2f822751c4e8dde7f94ee5ce8a9372244870452458f4fe1",
+ "sha256:107050b2e1c885b748727573a54a85d2e1ea9ad86146370f6eb79ca18b9673d4",
+ "sha256:13f9769eac9c4027eba2f400e635572796f7a7feb343f442d13c4b78e7d6f536",
+ "sha256:14efeada36418525da7c3b26393041b85242ffa1165328ec7eaf9b9780b72d62",
+ "sha256:1ab6980d10b8a02fd0dc73e28f18a0a3e17be636d314c1fdaa3bbb3e36a81f0f",
+ "sha256:45286418a43a3dfab50bdaf922f5003dbd2c3d1f696d23883568f4fa14b8093e",
+ "sha256:66c2442154102bff8df9c6f05cb72cd5cda6f4e1ed88592800ab1b6e8100e806",
+ "sha256:73de4f925ae5442d3361a71d7c0eeb1b4c540bf3d0c91100a00325ccef9e743c",
+ "sha256:80cbeeb6d083bc2a9106214188e5ce05362f248c1051344dc6673b7b38a561da",
+ "sha256:8460b10c9b82cc9d0b6056e1fe206bea209fe5a83ba87bdf9486305657224a44",
+ "sha256:903c506fc05eb6b76e4d31f957c1118078582db80f8ef5ce5ac74418f094d498",
+ "sha256:dcb773e3e275de896efebe57159a67587283d6ca1d1a36695170a3756fd2ef3a"
],
"index": "ia",
- "version": "==0.1.9"
+ "version": "==0.3.0"
},
"brotli": {
"hashes": [
+ "sha256:02177603aaca36e1fd21b091cb742bb3b305a569e2402f1ca38af471777fb019",
+ "sha256:11d3283d89af7033236fa4e73ec2cbe743d4f6a81d41bd234f24bf63dde979df",
+ "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
+ "sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
"sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
"sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
+ "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
+ "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
+ "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
+ "sha256:3148362937217b7072cf80a2dcc007f09bb5ecb96dae4617316638194113d5be",
+ "sha256:330e3f10cd01da535c70d09c4283ba2df5fb78e915bea0a28becad6e2ac010be",
+ "sha256:336b40348269f9b91268378de5ff44dc6fbaa2268194f85177b53463d313842a",
+ "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:3b8b09a16a1950b9ef495a0f8b9d0a87599a9d1f179e2d4ac014b2ec831f87e7",
+ "sha256:3c1306004d49b84bd0c4f90457c6f57ad109f5cc6067a9664e12b7b79a9948ad",
+ "sha256:3ffaadcaeafe9d30a7e4e1e97ad727e4f5610b9fa2f7551998471e3736738679",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
+ "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
+ "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
+ "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5bf37a08493232fbb0f8229f1824b366c2fc1d02d64e7e918af40acd15f3e337",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
+ "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
+ "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
+ "sha256:73fd30d4ce0ea48010564ccee1a26bfe39323fde05cb34b5863455629db61dc7",
+ "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
+ "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
+ "sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
+ "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
+ "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
+ "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
+ "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
+ "sha256:8ed6a5b3d23ecc00ea02e1ed8e0ff9a08f4fc87a1f58a2530e71c0f48adf882f",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
+ "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
+ "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
+ "sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
+ "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
+ "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b1375b5d17d6145c798661b67e4ae9d5496920d9265e2f00f1c2c0b5ae91fbde",
+ "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
+ "sha256:b3523f51818e8f16599613edddb1ff924eeb4b53ab7e7197f85cbc321cdca32f",
+ "sha256:b43775532a5904bc938f9c15b77c613cb6ad6fb30990f3b0afaea82797a402d8",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
+ "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
+ "sha256:ba72d37e2a924717990f4d7482e8ac88e2ef43fb95491eb6e0d124d77d2a150d",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:c8e521a0ce7cf690ca84b8cc2272ddaf9d8a50294fd086da67e517439614c755",
+ "sha256:cab1b5964b39607a66adbba01f1c12df2e55ac36c81ec6ed44f2fca44178bf1a",
+ "sha256:cb02ed34557afde2d2da68194d12f5719ee96cfb2eacc886352cb73e3808fc5d",
+ "sha256:cc0283a406774f465fb45ec7efb66857c09ffefbe49ec20b7882eff6d3c86d3a",
+ "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
+ "sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
+ "sha256:e1abbeef02962596548382e393f56e4c94acd286bd0c5afba756cffc33670e8a",
+ "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
+ "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
+ "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
+ "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
+ "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
],
"version": "==1.0.9"
@@ -141,98 +189,141 @@
},
"certifi": {
"hashes": [
- "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
- "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2020.12.5"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
"chardet": {
"hashes": [
- "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
- "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
+ "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5",
+ "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"
],
- "version": "==4.0.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.1.0"
},
- "click": {
+ "charset-normalizer": {
"hashes": [
- "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a",
- "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==7.1.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
"configparser": {
"hashes": [
- "sha256:005c3b102c96f4be9b8f40dafbd4997db003d07d1caa19f37808be8031475f2a",
- "sha256:08e8a59ef1817ac4ed810bb8e17d049566dd6e024e7566f6285c756db2bb4ff8"
+ "sha256:8be267824b541c09b08db124917f48ab525a6c3e837011f3130781a224c57090",
+ "sha256:b065779fd93c6bf4cee42202fa4351b4bb842e96a3fb469440e484517a49b9fa"
],
- "version": "==5.0.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.3.0"
},
"confluent-kafka": {
"hashes": [
- "sha256:00acc73f7d49961bf427f5e4fd6c0a220a6bfa5ccc91e0ad1f9ffa1751a169b0",
- "sha256:0a59afbb90bdd22b9acdd3bb134f5ee1dff3cc5df55eaf52bf97b2f8d0d00de3",
- "sha256:13b0e2011560f461ff39daf38089dd7f91404b3e66dba0456ccce0700f93c4f2",
- "sha256:175c7064c8f19975616974558c45f42c147a202d4b1c0b0a83afefb920367696",
- "sha256:22d7201d1aa89f1c5546749e781492925ed3eb0d7bd8f781fc57294cd45ddde3",
- "sha256:3034cacc3b0d03eb3ce39cc5a64c1070d223870246f5d90c9113996be9db7df8",
- "sha256:3e2d4f55ca952aeada3831d6615dc13a8a42c8e97175855ca08bbc6e6091b080",
- "sha256:5a1c47320d6afc5b2599f8f8e143aed6845a2d903facde984606e02f10f11221",
- "sha256:7b03bd9cc7b5e4df0a27eed359762c61a35313d4981ef1d9b418069eee454e66",
- "sha256:85ff4823770ce2efaabb46d88e5ae26a840e0051fd481abaa805f21a5a84d003",
- "sha256:9534cd2c0313df75b70eb4cf729382998970d97bbdda5cf3aef7081b855ccebe",
- "sha256:99b13d0957a5967c85aee6138ef5f9acec90294267a549c5683744f20cf5d7b4",
- "sha256:9a1c77291c1ac4b991aa0358f2f44636686eb8f52fb628502d30c312160a14e9",
- "sha256:9ac812006000887f76c95b8a33a9f0b65845bf072fbc54a42a1acffd34e41120",
- "sha256:9c47b8aacfe347bffd86bf75b98626718912b63df87f256dff1abc06a0355410",
- "sha256:a116382ae67e0d6a54684bab4ee9b1be54e789d031a6e5e74c3edc657c79d23c",
- "sha256:b1c89f3653385acc5da71570e03281f35ac6960367f2b2a426ae431deb1a1a35",
- "sha256:bb77276d569f511abe4a5b32a53f8a30285bc7be68219e5711a44720bf356ac2",
- "sha256:bbd9633552840ab9367fb762ea21272759db8caec2c34ff16ee28be177644cdf",
- "sha256:bfdfa81e4e72d2c24e408a5e199aae0a477499ae40647dfa6906d002d9b07f38",
- "sha256:c7461d6db081c23a6d38ceba348e7c178d7e974cf22c45ba8a4918ecb8855a44",
- "sha256:d6a5d4c72360a75e875e88f7cce42b66a786d037ca2002303ab1c580d49caf53",
- "sha256:dabed41cc60d1fc6d3cb44a90fe02e5192c9bf0f73c7b35761981e62ecabc592",
- "sha256:dd544847c713eeeb525031348ff6ffea4ecdd11c13590893e599a9d4676a9bd4",
- "sha256:eba169a9de8c978c9f33c763857c5279eceac46a4fd55a381c2528b9d4b3359e",
- "sha256:f2d1ee0bfdf618017bbfaa42406546155c1a86263e4f286295318578c723803b"
+ "sha256:24872e3e427b16f77461ae7e6cf48f9c5c03c06884ac51bad179580a4dd29145",
+ "sha256:2fb97bd25d436bd59fe079885aa77a3a2f23cface9c6359d4700053665849262",
+ "sha256:3207c76d1510571cbda85560c293dec5f8d6645103b3f471abab5c83e51a7ccd",
+ "sha256:344a7fec57d3348002392a7bd5cf66fb9dbe4a103e81636037cccd6fff944e28",
+ "sha256:382739e499deaf488459c2307ebcc0e9b3653340801d6053c207c84ad710ee8d",
+ "sha256:4d6bfcc352cd608fcf325037b4425c0edaeae0c6a5439423a865110b59f897e9",
+ "sha256:4f27ddf7daf630a95e1d7dfddd0c8cf8a7755c9567dc9851bf2e74c22f34af42",
+ "sha256:5b24587b30a4d288a7b1c5cc756ee707fc1293fa28454f8db40267ed9d7e73c8",
+ "sha256:6ab745babc33a864e3ca3a2659c005ed52503e39936fff5812eeb21920009c8b",
+ "sha256:7e6592533b3f8cfbc086ea2d472058f10e5f6a04a388edca01773285c63284b4",
+ "sha256:b9ad6ad9d58c2735129f94f044b2236d7de87d77a101c8c630363add12d62a4a",
+ "sha256:be7b37020f614017d8a047565b3fb61ceef9c30a9ee093f9373d06a4c32068ae",
+ "sha256:bef263b6d78a3e63399e1b82cc07cbb30af762884df96a369cba0e1011405344",
+ "sha256:c4b7c4d0b647952d2b506948131d6e7e1c42ccb16aac8e3e52369c16b94e7215",
+ "sha256:d036bf5e1d7cb3743125d7caf62b1a23b12e403de240144b6117ddbb8f815a33",
+ "sha256:d0cbf8e7510497afd651e134bccb9d579aa90234e45734046fcb6b752d2ee312",
+ "sha256:d533ea0e527122f177943ee35eb356b8d9f7af35fe357e0cdc0514d95804aaea",
+ "sha256:e41b9313c44f54a3cd29b0e95fa32a8e685edaa9287b338f59530b21ebc0b453",
+ "sha256:e9107767cc9240cbf9b5c0fdded5eeead86a1690d1c15de6cbbdcc9d7e3b1962",
+ "sha256:f96033c335da26ea1716ab9adfce459c211b023ca09528f958fb28bf099fc0df",
+ "sha256:f970a2c6d22c934ea68d645abcc96056ecb107489f28a38b2171f65655b7e41f",
+ "sha256:fe31b3b6930d67380df371f5088950f93da5fac580cde3bedb35f992b2498e1b",
+ "sha256:ff08b9f978f8b37f2961614a68f9fdb4fabd10cdd940234e80200806d93a1c30",
+ "sha256:ff4d1557b7fb72e752c36205a344863b8f4f23b3a834780fc36eb7ebde614de7"
],
"index": "ia",
- "version": "==1.5.0"
+ "version": "==1.9.2"
},
"contextlib2": {
"hashes": [
- "sha256:01f490098c18b19d2bd5bb5dc445b2054d2fa97f09a4280ba2c5f3c394c8162e",
- "sha256:3355078a159fbb44ee60ea80abd0d87b80b78c248643b49aa6d94673b413609b"
+ "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f",
+ "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869"
],
- "version": "==0.6.0.post1"
+ "markers": "python_version >= '3.6'",
+ "version": "==21.6.0"
},
"courlan": {
"hashes": [
- "sha256:16b22e6b98838469793ce6c4b9501d7a7eff679c227a4d3c135349d1da12f623",
- "sha256:649756066671c1fdcbef129766300aa1b1c5b2cf5bcdedcb0aadcd7f09cd5e6b"
+ "sha256:d06c5b048b2b5cd5c0ac77304dc24b795e4bb257a7b6077ea405a3b5e99ae179",
+ "sha256:d141d30f8e52d344cf9904aa29e4d8750e934026bdbca2dc7bd58b750566d058"
],
- "version": "==0.2.3"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
},
"crawllib": {
"hashes": [
- "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
+ "sha256:9a30a10318dc706f1e27ff0af950ac14a77f73c18d329771f44d872fd63630e3"
],
- "version": "==0.1.4.8"
+ "version": "==0.1.6"
},
- "cssselect": {
- "hashes": [
- "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
- "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
- ],
- "version": "==1.1.0"
+ "cython": {
+ "hashes": [
+ "sha256:061e25151c38f2361bc790d3bcf7f9d9828a0b6a4d5afa56fbed3bd33fb2373a",
+ "sha256:06be83490c906b6429b4389e13487a26254ccaad2eef6f3d4ee21d8d3a4aaa2b",
+ "sha256:07d173d3289415bb496e72cb0ddd609961be08fe2968c39094d5712ffb78672b",
+ "sha256:0bbc27abdf6aebfa1bce34cd92bd403070356f28b0ecb3198ff8a182791d58b9",
+ "sha256:0ea8267fc373a2c5064ad77d8ff7bf0ea8b88f7407098ff51829381f8ec1d5d9",
+ "sha256:3875c2b2ea752816a4d7ae59d45bb546e7c4c79093c83e3ba7f4d9051dd02928",
+ "sha256:39afb4679b8c6bf7ccb15b24025568f4f9b4d7f9bf3cbd981021f542acecd75b",
+ "sha256:3f85eb2343d20d91a4ea9cf14e5748092b376a64b7e07fc224e85b2753e9070b",
+ "sha256:40eff7aa26e91cf108fd740ffd4daf49f39b2fdffadabc7292b4b7dc5df879f0",
+ "sha256:479690d2892ca56d34812fe6ab8f58e4b2e0129140f3d94518f15993c40553da",
+ "sha256:4a4b03ab483271f69221c3210f7cde0dcc456749ecf8243b95bc7a701e5677e0",
+ "sha256:513e9707407608ac0d306c8b09d55a28be23ea4152cbd356ceaec0f32ef08d65",
+ "sha256:5514f3b4122cb22317122a48e175a7194e18e1803ca555c4c959d7dfe68eaf98",
+ "sha256:5ba622326f2862f9c1f99ca8d47ade49871241920a352c917e16861e25b0e5c3",
+ "sha256:63b79d9e1f7c4d1f498ab1322156a0d7dc1b6004bf981a8abda3f66800e140cd",
+ "sha256:656dc5ff1d269de4d11ee8542f2ffd15ab466c447c1f10e5b8aba6f561967276",
+ "sha256:67fdd2f652f8d4840042e2d2d91e15636ba2bcdcd92e7e5ffbc68e6ef633a754",
+ "sha256:79e3bab19cf1b021b613567c22eb18b76c0c547b9bc3903881a07bfd9e7e64cf",
+ "sha256:856d2fec682b3f31583719cb6925c6cdbb9aa30f03122bcc45c65c8b6f515754",
+ "sha256:8669cadeb26d9a58a5e6b8ce34d2c8986cc3b5c0bfa77eda6ceb471596cb2ec3",
+ "sha256:8733cf4758b79304f2a4e39ebfac5e92341bce47bcceb26c1254398b2f8c1af7",
+ "sha256:97335b2cd4acebf30d14e2855d882de83ad838491a09be2011745579ac975833",
+ "sha256:afbce249133a830f121b917f8c9404a44f2950e0e4f5d1e68f043da4c2e9f457",
+ "sha256:b0595aee62809ba353cebc5c7978e0e443760c3e882e2c7672c73ffe46383673",
+ "sha256:b6da3063c5c476f5311fd76854abae6c315f1513ef7d7904deed2e774623bbb9",
+ "sha256:c8e8025f496b5acb6ba95da2fb3e9dacffc97d9a92711aacfdd42f9c5927e094",
+ "sha256:cddc47ec746a08603037731f5d10aebf770ced08666100bd2cdcaf06a85d4d1b",
+ "sha256:cdf10af3e2e3279dc09fdc5f95deaa624850a53913f30350ceee824dc14fc1a6",
+ "sha256:d968ffc403d92addf20b68924d95428d523436adfd25cf505d427ed7ba3bee8b",
+ "sha256:dbee03b8d42dca924e6aa057b836a064c769ddfd2a4c2919e65da2c8a362d528",
+ "sha256:e1958e0227a4a6a2c06fd6e35b7469de50adf174102454db397cec6e1403cce3",
+ "sha256:e6ffa08aa1c111a1ebcbd1cf4afaaec120bc0bbdec3f2545f8bb7d3e8e77a1cd",
+ "sha256:e83228e0994497900af954adcac27f64c9a57cd70a9ec768ab0cb2c01fd15cf1",
+ "sha256:ea1dcc07bfb37367b639415333cfbfe4a93c3be340edf1db10964bc27d42ed64",
+ "sha256:eca3065a1279456e81c615211d025ea11bfe4e19f0c5650b859868ca04b3fcbd",
+ "sha256:ed087eeb88a8cf96c60fb76c5c3b5fb87188adee5e179f89ec9ad9a43c0c54b3",
+ "sha256:eeb475eb6f0ccf6c039035eb4f0f928eb53ead88777e0a760eccb140ad90930b",
+ "sha256:eefd2b9a5f38ded8d859fe96cc28d7d06e098dc3f677e7adbafda4dcdd4a461c",
+ "sha256:f3fd44cc362eee8ae569025f070d56208908916794b6ab21e139cea56470a2b3",
+ "sha256:f9944013588a3543fca795fffb0a070a31a243aa4f2d212f118aa95e69485831"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==0.29.32"
},
"dateparser": {
"hashes": [
- "sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a",
- "sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8"
+ "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c",
+ "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e"
],
"index": "ia",
- "version": "==1.0.0"
+ "version": "==1.1.4"
},
"dawg": {
"hashes": [
@@ -251,10 +342,11 @@
},
"decorator": {
"hashes": [
- "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
- "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
],
- "version": "==4.4.2"
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
},
"docopt": {
"hashes": [
@@ -270,54 +362,58 @@
},
"dynaconf": {
"hashes": [
- "sha256:808adfe964f10695846dbf8dad7632e47fc3bc38860fd1887ed57dddffc4eff2",
- "sha256:9b34ab2f811a81755f5eb4beac77a69e1e0887528c7e37fc4bc83fed52dcf502"
+ "sha256:87e0b3b12b5db9e8fb465e1f8c7fdb926cd2ec5b6d88aa7f821f316df93fb165",
+ "sha256:d9cfb50fd4a71a543fd23845d4f585b620b6ff6d9d3cc1825c614f7b2097cb39"
],
"index": "ia",
- "version": "==3.1.2"
+ "version": "==3.1.11"
},
"elasticsearch": {
"hashes": [
- "sha256:4ebd34fd223b31c99d9f3b6b6236d3ac18b3046191a37231e8235b06ae7db955",
- "sha256:a725dd923d349ca0652cf95d6ce23d952e2153740cf4ab6daf4a2d804feeed48"
+ "sha256:840adeb45a5ec9102a83f3cf481aae83a3775b75d6dd83a7310b04e44a5d0308",
+ "sha256:f511ea92e96db09b0e96b0de5fbbb7aa5c3740b0c571a364a2c3a1cc7ec06203"
],
- "version": "==7.10.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'",
+ "version": "==7.17.8"
},
"filelock": {
"hashes": [
- "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59",
- "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"
+ "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2",
+ "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c"
],
- "version": "==3.0.12"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.8.2"
},
- "flask": {
+ "ftfy": {
"hashes": [
- "sha256:4efa1ae2d7c9865af48986de8aeb8504bf32c7f3d6fdc9353d34b21f4b127060",
- "sha256:8a4fdd8936eba2512e9c85df320a37e694c93945b33ef33c89946a340a238557"
+ "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca",
+ "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"
],
"index": "ia",
- "version": "==1.1.2"
+ "version": "==6.1.1"
},
- "ftfy": {
+ "globalwayback": {
"hashes": [
- "sha256:51c7767f8c4b47d291fcef30b9625fb5341c06a31e6a3b627039c706c42f3720"
+ "sha256:683f19dee720ef11335952aa33615e50c945196c82e18a5d8150635f92022d23"
],
"index": "ia",
- "version": "==5.8"
+ "version": "==0.8.12.6"
},
- "globalwayback": {
+ "grobid-tei-xml": {
"hashes": [
- "sha256:579822d0521ceb0427b9e617ff5355ff5742c9927d571d11711954f761d7e81a"
+ "sha256:022fdf54dbd067b520c1effe3c1a1f2ac248492ea310627e9462757748cb461b",
+ "sha256:35c9afb14f6f76100dce5f5815e67ec9fa4122e2f268394e0baf6eafbd8668d8"
],
"index": "ia",
- "version": "==0.6.8"
+ "version": "==0.1.3"
},
"htmldate": {
"hashes": [
- "sha256:03f4e9648bf5bade11ecdb2a825a06019fafbfdcafd88151a4ce0407325f43c7",
- "sha256:2e383fdbac3fb8a3cc6307502d7b920bb10f938113a1d108ec315aa195a2bc28"
+ "sha256:603b86eaf0f076efcd653d57fe0470305f751417711f4e373279235d0ff587e6",
+ "sha256:83830715faf0f22272d9e24e571a4955308a008107d0ca9359c0de77b99766cd"
],
- "version": "==0.7.2"
+ "index": "ia",
+ "version": "==1.3.2"
},
"ialib": {
"hashes": [
@@ -334,149 +430,201 @@
},
"internetarchive": {
"hashes": [
- "sha256:0e9b24577086283280a5089b3e65086640b4e42d61ca4af913639f45b02b9e4c",
- "sha256:bf28ab57939a80a61c2cf66bb7173ea1989013494dab564c99035574d5b4faea"
+ "sha256:de856465c2ef6852184d08bfd59c0ca01904865b373a27b383034ac6b4128eb6"
],
"index": "ia",
- "version": "==1.9.6"
- },
- "itsdangerous": {
- "hashes": [
- "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
- "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
- ],
- "version": "==1.1.0"
+ "version": "==3.0.2"
},
"jinja2": {
"hashes": [
- "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0",
- "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"
+ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
+ "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
- "version": "==2.11.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.1.2"
},
"jmespath": {
"hashes": [
- "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
- "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
+ "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",
+ "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"
],
- "version": "==0.10.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.0.1"
},
"jsonpatch": {
"hashes": [
- "sha256:da3831be60919e8c98564acfc1fa918cb96e7c9750b0428388483f04d0d1c5a7",
- "sha256:e930adc932e4d36087dbbf0f22e1ded32185dfb20662f2e3dd848677a5295a14"
+ "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397",
+ "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"
],
- "version": "==1.28"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.32"
},
"jsonpointer": {
"hashes": [
- "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362",
- "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e"
+ "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9",
+ "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"
],
- "version": "==2.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==2.3"
},
"justext": {
"hashes": [
- "sha256:330035dfaaa960465276afa1836dfb6e63791011a8dfc6da2757142cc4d14d54",
- "sha256:4b8b7f0749e8725f0089ebe0239c1a45286d61bf507b3f05d136c2700dea4aa6"
+ "sha256:7640e248218795f6be65f6c35fe697325a3280fcb4675d1525bcdff2b86faadf",
+ "sha256:86b48f5b1d99505acd072f5831def6cd3f1306043651c524a1c609e62e3544e4"
],
- "version": "==2.2.0"
+ "version": "==3.0.0"
},
- "lxml": {
+ "langcodes": {
"hashes": [
- "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
- "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
- "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
- "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
- "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
- "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
- "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
- "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
- "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
- "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
- "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
- "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
- "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
- "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
- "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
- "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
- "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
- "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
- "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
- "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
- "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
- "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
- "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
- "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
- "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
- "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
- "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
- "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
- "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
- "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
- "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
- "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
- "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
- "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
- "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
- "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
- "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
+ "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69",
+ "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"
],
- "markers": "python_version >= '3.5'",
- "version": "==4.6.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==3.3.0"
+ },
+ "lxml": {
+ "hashes": [
+ "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7",
+ "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726",
+ "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03",
+ "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140",
+ "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a",
+ "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05",
+ "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03",
+ "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419",
+ "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4",
+ "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e",
+ "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67",
+ "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50",
+ "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894",
+ "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf",
+ "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947",
+ "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1",
+ "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd",
+ "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3",
+ "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92",
+ "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3",
+ "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457",
+ "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74",
+ "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf",
+ "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1",
+ "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4",
+ "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975",
+ "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5",
+ "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe",
+ "sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7",
+ "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1",
+ "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2",
+ "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409",
+ "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f",
+ "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f",
+ "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5",
+ "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24",
+ "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e",
+ "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4",
+ "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a",
+ "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c",
+ "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de",
+ "sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f",
+ "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b",
+ "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5",
+ "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7",
+ "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a",
+ "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c",
+ "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9",
+ "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e",
+ "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab",
+ "sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941",
+ "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5",
+ "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45",
+ "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7",
+ "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892",
+ "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746",
+ "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c",
+ "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53",
+ "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe",
+ "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184",
+ "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38",
+ "sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df",
+ "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9",
+ "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b",
+ "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2",
+ "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0",
+ "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda",
+ "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b",
+ "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5",
+ "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380",
+ "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33",
+ "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8",
+ "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1",
+ "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889",
+ "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9",
+ "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f",
+ "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==4.9.2"
},
"markupsafe": {
"hashes": [
- "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
- "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
- "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
- "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
- "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
- "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
- "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
- "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
- "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
- "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
- "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
- "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
- "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
- "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
- "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
- "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
- "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
- "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
- "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
- "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
- "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
- "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
- "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
- "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
- "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
- "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
- "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
- "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
- "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
- "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
- "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
- "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
- "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
- ],
- "version": "==1.1.1"
+ "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
+ "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
+ "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
+ "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
+ "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
+ "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
+ "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
+ "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
+ "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
+ "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
+ "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
+ "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
+ "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
+ "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
+ "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
+ "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
+ "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
+ "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
+ "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
+ "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
+ "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
+ "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
+ "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
+ "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
+ "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
+ "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
+ "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
+ "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
+ "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
+ "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
+ "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
+ "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
+ "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
+ "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
+ "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
+ "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
+ "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
+ "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
+ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
+ "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.1.1"
},
"minio": {
"hashes": [
- "sha256:bae5d060796ba3eb8dbdd2bbb7dc83da58ec4aca284689489467cacdc4a8b112",
- "sha256:f96d7f461ff88d7df2932eecf55f955ed93874c7ed84bc9454b2d7955de768fe"
+ "sha256:7cb075b56bac894551304cb824f958069a84e0dd2d0a685f9bed3c05e15727bf",
+ "sha256:acae9bfae0aec1b92025bd63e18135ebb4994c84600716c5323e14cb0c9a0b03",
+ "sha256:eec4ab073ff979c34e928e532d8acc1d40d61ba4404709cf27ab3ecdcfa2a561"
],
"index": "ia",
- "version": "==7.0.0"
+ "version": "==6.0.2"
},
- "pathspec": {
+ "perfstat": {
"hashes": [
- "sha256:86379d6b86d75816baba717e64b1a3a3469deb93bb76d613c9ce79edc5cb68fd",
- "sha256:aa0cb481c4041bf52ffa7b0d8fa6cd3e88a2ca4879c533c9153882ee2556790d"
+ "sha256:4f91fab9be6076972c66fe818eed488be28f1044009237adccce42ff2c7861f5"
],
- "version": "==0.8.1"
+ "version": "==0.1.0.1"
},
"pillow": {
"hashes": [
@@ -535,29 +683,22 @@
},
"psycopg2": {
"hashes": [
- "sha256:00195b5f6832dbf2876b8bf77f12bdce648224c89c880719c745b90515233301",
- "sha256:068115e13c70dc5982dfc00c5d70437fe37c014c808acce119b5448361c03725",
- "sha256:17d50b4966818e09e7221f2d64667a7a2fbb43cff6210d6fb6236a16fe8fc622",
- "sha256:26e7fd115a6db75267b325de0fba089b911a4a12ebd3d0b5e7acb7028bc46821",
- "sha256:2c93d4d16933fea5bbacbe1aaf8fa8c1348740b2e50b3735d1b0bf8154cbf0f3",
- "sha256:56007a226b8e95aa980ada7abdea6b40b75ce62a433bd27cec7a8178d57f4051",
- "sha256:56fee7f818d032f802b8eed81ef0c1232b8b42390df189cab9cfa87573fe52c5",
- "sha256:6a3d9efb6f36f1fe6aa8dbb5af55e067db802502c55a9defa47c5a1dad41df84",
- "sha256:6c237e85e534045ea8e9a49ba57fe1c362b564aaac4940083fef7e74c6bf64cc",
- "sha256:7a02112996b0dd47a5a2b13c67b301284ebcc68ce7f4881d1f97f8598fe6f1f5",
- "sha256:7e82d44fc5327d0e6b0f7428bc572a645e6cfa8647860ce1da0d262e548ad921",
- "sha256:a49833abfdede8985ba3f3ec641f771cca215479f41523e99dace96d5b8cce2a",
- "sha256:ad2fe8a37be669082e61fb001c185ffb58867fdbb3e7a6b0b0d2ffe232353a3e",
- "sha256:b8cae8b2f022efa1f011cc753adb9cbadfa5a184431d09b273fb49b4167561ad",
- "sha256:d160744652e81c80627a909a0e808f3c6653a40af435744de037e3172cf277f5",
- "sha256:d5062ae50b222da28253059880a871dc87e099c25cb68acf613d9d227413d6f7",
- "sha256:f155cf65726e4afc2316028fcc5791a1bf384cf2c96562b8b97f18c1fb64f272",
- "sha256:f22ea9b67aea4f4a1718300908a2fb62b3e4276cf00bd829a97ab5894af42ea3",
- "sha256:f974c96fca34ae9e4f49839ba6b78addf0346777b46c4da27a7bf54f48d3057d",
- "sha256:fb23f6c71107c37fd667cb4ea363ddeb936b348bbd6449278eb92c189699f543"
+ "sha256:093e3894d2d3c592ab0945d9eba9d139c139664dcf83a1c440b8a7aa9bb21955",
+ "sha256:190d51e8c1b25a47484e52a79638a8182451d6f6dff99f26ad9bd81e5359a0fa",
+ "sha256:1a5c7d7d577e0eabfcf15eb87d1e19314c8c4f0e722a301f98e0e3a65e238b4e",
+ "sha256:1e5a38aa85bd660c53947bd28aeaafb6a97d70423606f1ccb044a03a1203fe4a",
+ "sha256:322fd5fca0b1113677089d4ebd5222c964b1760e361f151cbb2706c4912112c5",
+ "sha256:4cb9936316d88bfab614666eb9e32995e794ed0f8f6b3b718666c22819c1d7ee",
+ "sha256:920bf418000dd17669d2904472efeab2b20546efd0548139618f8fa305d1d7ad",
+ "sha256:922cc5f0b98a5f2b1ff481f5551b95cd04580fd6f0c72d9b22e6c0145a4840e0",
+ "sha256:a5246d2e683a972e2187a8714b5c2cf8156c064629f9a9b1a873c1730d9e245a",
+ "sha256:b9ac1b0d8ecc49e05e4e182694f418d27f3aedcfca854ebd6c05bb1cffa10d6d",
+ "sha256:d3ef67e630b0de0779c42912fe2cbae3805ebaba30cda27fea2a3de650a9414f",
+ "sha256:f5b6320dbc3cf6cfb9f25308286f9f7ab464e65cfb105b64cc9c52831748ced2",
+ "sha256:fc04dd5189b90d825509caa510f20d1d504761e78b8dfb95a0ede180f71d50e5"
],
"index": "ia",
- "version": "==2.8.6"
+ "version": "==2.9.5"
},
"publicsuffix": {
"hashes": [
@@ -567,86 +708,171 @@
},
"pydantic": {
"hashes": [
- "sha256:025bf13ce27990acc059d0c5be46f416fc9b293f45363b3d19855165fee1874f",
- "sha256:185e18134bec5ef43351149fe34fda4758e53d05bb8ea4d5928f0720997b79ef",
- "sha256:213125b7e9e64713d16d988d10997dabc6a1f73f3991e1ff8e35ebb1409c7dc9",
- "sha256:24ca47365be2a5a3cc3f4a26dcc755bcdc9f0036f55dcedbd55663662ba145ec",
- "sha256:38be427ea01a78206bcaf9a56f835784afcba9e5b88fbdce33bbbfbcd7841229",
- "sha256:475f2fa134cf272d6631072554f845d0630907fce053926ff634cc6bc45bf1af",
- "sha256:514b473d264671a5c672dfb28bdfe1bf1afd390f6b206aa2ec9fed7fc592c48e",
- "sha256:59e45f3b694b05a69032a0d603c32d453a23f0de80844fb14d55ab0c6c78ff2f",
- "sha256:5b24e8a572e4b4c18f614004dda8c9f2c07328cb5b6e314d6e1bbd536cb1a6c1",
- "sha256:6e3874aa7e8babd37b40c4504e3a94cc2023696ced5a0500949f3347664ff8e2",
- "sha256:8d72e814c7821125b16f1553124d12faba88e85405b0864328899aceaad7282b",
- "sha256:a4143c8d0c456a093387b96e0f5ee941a950992904d88bc816b4f0e72c9a0009",
- "sha256:b2b054d095b6431cdda2f852a6d2f0fdec77686b305c57961b4c5dd6d863bf3c",
- "sha256:c59ea046aea25be14dc22d69c97bee629e6d48d2b2ecb724d7fe8806bf5f61cd",
- "sha256:d1fe3f0df8ac0f3a9792666c69a7cd70530f329036426d06b4f899c025aca74e",
- "sha256:d8df4b9090b595511906fa48deda47af04e7d092318bfb291f4d45dfb6bb2127",
- "sha256:dba5c1f0a3aeea5083e75db9660935da90216f8a81b6d68e67f54e135ed5eb23",
- "sha256:e682f6442ebe4e50cb5e1cfde7dda6766fb586631c3e5569f6aa1951fd1a76ef",
- "sha256:ecb54491f98544c12c66ff3d15e701612fc388161fd455242447083350904730",
- "sha256:f5b06f5099e163295b8ff5b1b71132ecf5866cc6e7f586d78d7d3fd6e8084608",
- "sha256:f6864844b039805add62ebe8a8c676286340ba0c6d043ae5dea24114b82a319e",
- "sha256:ffd180ebd5dd2a9ac0da4e8b995c9c99e7c74c31f985ba090ee01d681b1c4b95"
+ "sha256:05e00dbebbe810b33c7a7362f231893183bcc4251f3f2ff991c31d5c08240c42",
+ "sha256:06094d18dd5e6f2bbf93efa54991c3240964bb663b87729ac340eb5014310624",
+ "sha256:0b959f4d8211fc964772b595ebb25f7652da3f22322c007b6fed26846a40685e",
+ "sha256:19b3b9ccf97af2b7519c42032441a891a5e05c68368f40865a90eb88833c2559",
+ "sha256:1b6ee725bd6e83ec78b1aa32c5b1fa67a3a65badddde3976bca5fe4568f27709",
+ "sha256:1ee433e274268a4b0c8fde7ad9d58ecba12b069a033ecc4645bb6303c062d2e9",
+ "sha256:216f3bcbf19c726b1cc22b099dd409aa371f55c08800bcea4c44c8f74b73478d",
+ "sha256:2d0567e60eb01bccda3a4df01df677adf6b437958d35c12a3ac3e0f078b0ee52",
+ "sha256:2e05aed07fa02231dbf03d0adb1be1d79cabb09025dd45aa094aa8b4e7b9dcda",
+ "sha256:352aedb1d71b8b0736c6d56ad2bd34c6982720644b0624462059ab29bd6e5912",
+ "sha256:355639d9afc76bcb9b0c3000ddcd08472ae75318a6eb67a15866b87e2efa168c",
+ "sha256:37c90345ec7dd2f1bcef82ce49b6235b40f282b94d3eec47e801baf864d15525",
+ "sha256:4b8795290deaae348c4eba0cebb196e1c6b98bdbe7f50b2d0d9a4a99716342fe",
+ "sha256:5760e164b807a48a8f25f8aa1a6d857e6ce62e7ec83ea5d5c5a802eac81bad41",
+ "sha256:6eb843dcc411b6a2237a694f5e1d649fc66c6064d02b204a7e9d194dff81eb4b",
+ "sha256:7b5ba54d026c2bd2cb769d3468885f23f43710f651688e91f5fb1edcf0ee9283",
+ "sha256:7c2abc4393dea97a4ccbb4ec7d8658d4e22c4765b7b9b9445588f16c71ad9965",
+ "sha256:81a7b66c3f499108b448f3f004801fcd7d7165fb4200acb03f1c2402da73ce4c",
+ "sha256:91b8e218852ef6007c2b98cd861601c6a09f1aa32bbbb74fab5b1c33d4a1e410",
+ "sha256:9300fcbebf85f6339a02c6994b2eb3ff1b9c8c14f502058b5bf349d42447dcf5",
+ "sha256:9cabf4a7f05a776e7793e72793cd92cc865ea0e83a819f9ae4ecccb1b8aa6116",
+ "sha256:a1f5a63a6dfe19d719b1b6e6106561869d2efaca6167f84f5ab9347887d78b98",
+ "sha256:a4c805731c33a8db4b6ace45ce440c4ef5336e712508b4d9e1aafa617dc9907f",
+ "sha256:ae544c47bec47a86bc7d350f965d8b15540e27e5aa4f55170ac6a75e5f73b644",
+ "sha256:b97890e56a694486f772d36efd2ba31612739bc6f3caeee50e9e7e3ebd2fdd13",
+ "sha256:bb6ad4489af1bac6955d38ebcb95079a836af31e4c4f74aba1ca05bb9f6027bd",
+ "sha256:bedf309630209e78582ffacda64a21f96f3ed2e51fbf3962d4d488e503420254",
+ "sha256:c1ba1afb396148bbc70e9eaa8c06c1716fdddabaf86e7027c5988bae2a829ab6",
+ "sha256:c33602f93bfb67779f9c507e4d69451664524389546bacfe1bee13cae6dc7488",
+ "sha256:c4aac8e7103bf598373208f6299fa9a5cfd1fc571f2d40bf1dd1955a63d6eeb5",
+ "sha256:c6f981882aea41e021f72779ce2a4e87267458cc4d39ea990729e21ef18f0f8c",
+ "sha256:cc78cc83110d2f275ec1970e7a831f4e371ee92405332ebfe9860a715f8336e1",
+ "sha256:d49f3db871575e0426b12e2f32fdb25e579dea16486a26e5a0474af87cb1ab0a",
+ "sha256:dd3f9a40c16daf323cf913593083698caee97df2804aa36c4b3175d5ac1b92a2",
+ "sha256:e0bedafe4bc165ad0a56ac0bd7695df25c50f76961da29c050712596cf092d6d",
+ "sha256:e9069e1b01525a96e6ff49e25876d90d5a563bc31c658289a8772ae186552236"
],
"index": "ia",
- "version": "==1.7.3"
+ "version": "==1.10.2"
},
"pylru": {
"hashes": [
- "sha256:492f934bb98dc6c8b2370c02c95c65516ddc08c8f64d27f70087eb038621d297"
+ "sha256:47ad140a63ab9389648dadfbb4330700e0ffeeb28ec04664ee47d37ed133b0f4",
+ "sha256:b7c75b0676e2fbae647823bc209e23998772867d3679f1583c7350a9b02a59f0"
+ ],
+ "version": "==1.2.1"
+ },
+ "pymupdf": {
+ "hashes": [
+ "sha256:05c54acf69ee55ef97453f9c52982ef2839c188fe464d6b4cdc053bd4c6298f1",
+ "sha256:11b913664c059146e512e8559ebd9f976570ef21c0338c953836bc02051c1d7e",
+ "sha256:13ed689e5ad4c3adecb7586050de8baaa1819f48e2c57ca4e87f80e3b2727cb3",
+ "sha256:164dc67f1f5db3b22207b2aeba0fadff0503123c8f31c46768b7da7d3595a181",
+ "sha256:1e7b85e2611a9cca7a410e4c5a510a11131de7c5da9379e46615a8d3adfa6df5",
+ "sha256:38188f88a6e648b9f3a87d29de5b4ed52f910827a15859b183f1321c68e6ac00",
+ "sha256:39192c009afd8dd877a79ed02519ec8d17699bec9e9543115e490f06a553e200",
+ "sha256:4c5e7211b85e13050ac6e25879d4f0476b7a04f23bd3b6442489cec9f8da8418",
+ "sha256:7281324a0325dd30c033644cc8654167dcbfe47c4b1d49805d407fa5a64ce76b",
+ "sha256:909fb46900e7422515291761a1294902cf163226ec8918ea4c3454537336dfeb",
+ "sha256:945529b7868f9fe290b11dfbc37e2b9012610fac9763686ccf91a4d968305c5e",
+ "sha256:976fb0e93f025617890f8f8d8517371684131aa0e9fc0c1d0b4cd8bd564cce27",
+ "sha256:9998f7dfa0f99d6c2c3eb0dcfbfd44433247c23c4b781bc45f76dab421bc554b",
+ "sha256:a3b8e5c2de6192c89f379283aa07aa7fd044098dab43a8cd3ac172e961caf286",
+ "sha256:b0db8c81b6c781e373ed005f7595e49b760f91edb3b36d1dc69ec29b4fad34f8",
+ "sha256:c03004415a6d140b2c4bb494bb507c9ccbd55d713407e3b5bc1dd35fa45f2be0",
+ "sha256:cfd6c666b02a066e9e76d9ce8ca5e7fa4f2bf7a8ce6934cd2837b08509d46f8e",
+ "sha256:dffe67c5574d0ebb1e39b5ecf806fb4fd4ddb01bee5630f516ece4468252c9f0",
+ "sha256:ef3d13e27f1585d776f6a2597f113aabd28d36b648b983a72850b21c5399ab08",
+ "sha256:f04086036d40af50e5d6f54e949fa12eacda2d752562a2f85215763b137bf864",
+ "sha256:f3f96bd465e9e0e2960bb70e92233af0865181b9dd8ac5bc6b159d79584df2fe"
],
- "version": "==1.2.0"
- },
- "pymysql": {
- "hashes": [
- "sha256:263040d2779a3b84930f7ac9da5132be0fefcd6f453a885756656103f8ee1fdd",
- "sha256:44f47128dda8676e021c8d2dbb49a82be9e4ab158b9f03e897152a3a287c69ea"
- ],
- "version": "==0.10.1"
+ "index": "ia",
+ "version": "==1.19.6"
},
"python-dateutil": {
"hashes": [
- "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
- "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
+ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
+ "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
- "version": "==2.8.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==2.8.2"
},
"python-magic": {
"hashes": [
- "sha256:356efa93c8899047d1eb7d3eb91e871ba2f5b1376edbaf4cc305e3c872207355",
- "sha256:b757db2a5289ea3f1ced9e60f072965243ea43a2221430048fd8cacab17be0ce"
+ "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b",
+ "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"
],
"index": "ia",
- "version": "==0.4.18"
+ "version": "==0.4.27"
},
"python-poppler": {
"hashes": [
- "sha256:6843398adc9c290035646c4cf3c7bfcea9c8e04390bb9cd8fdc9bd063fb77880"
+ "sha256:8b6a157e51cbb4c08353a21ca3f6f396558759cdfb0b80071379ad89d5f7c533"
],
"index": "ia",
- "version": "==0.2.2"
+ "version": "==0.3.0"
},
"python-snappy": {
"hashes": [
- "sha256:9c0ba725755b749ef9b03f6ed7582cefb957c0d9f6f064a7c4314148a9dbdb61",
- "sha256:a745b3732750e2e627adf45fe2669b18afb4170431b0d100da041f807bdea0c8",
- "sha256:ac48ec6146d71627bba0fe4857984ac1f3f70a35c12eed0f91b46f353952d5fa",
- "sha256:b08db966a9c041220b1b602a2e36498dc0755b46b0d8b119f568de71804b9aed",
- "sha256:d9c26532cfa510f45e8d135cde140e8a5603d3fb254cfec273ebc0ecf9f668e2",
- "sha256:f21e8472a7f11b65b4bb5aea1c12624e2d4199aa586c57a11faa0de86a3053a6",
- "sha256:f8bbf1e04d0ec722a7f2e16f2c179f5ada4cfc0ac1196703225894303b061dbb"
+ "sha256:03bb511380fca2a13325b6f16fe8234c8e12da9660f0258cd45d9a02ffc916af",
+ "sha256:0bdb6942180660bda7f7d01f4c0def3cfc72b1c6d99aad964801775a3e379aba",
+ "sha256:0d489b50f49433494160c45048fe806de6b3aeab0586e497ebd22a0bab56e427",
+ "sha256:1a993dc8aadd901915a510fe6af5f20ae4256f527040066c22a154db8946751f",
+ "sha256:1d029f7051ec1bbeaa3e03030b6d8ed47ceb69cae9016f493c802a08af54e026",
+ "sha256:277757d5dad4e239dc1417438a0871b65b1b155beb108888e7438c27ffc6a8cc",
+ "sha256:2a7e528ab6e09c0d67dcb61a1730a292683e5ff9bb088950638d3170cf2a0a54",
+ "sha256:2aaaf618c68d8c9daebc23a20436bd01b09ee70d7fbf7072b7f38b06d2fab539",
+ "sha256:2be4f4550acd484912441f5f1209ba611ac399aac9355fee73611b9a0d4f949c",
+ "sha256:39692bedbe0b717001a99915ac0eb2d9d0bad546440d392a2042b96d813eede1",
+ "sha256:3fb9a88a4dd6336488f3de67ce75816d0d796dce53c2c6e4d70e0b565633c7fd",
+ "sha256:4038019b1bcaadde726a57430718394076c5a21545ebc5badad2c045a09546cf",
+ "sha256:463fd340a499d47b26ca42d2f36a639188738f6e2098c6dbf80aef0e60f461e1",
+ "sha256:4d3cafdf454354a621c8ab7408e45aa4e9d5c0b943b61ff4815f71ca6bdf0130",
+ "sha256:4ec533a8c1f8df797bded662ec3e494d225b37855bb63eb0d75464a07947477c",
+ "sha256:530bfb9efebcc1aab8bb4ebcbd92b54477eed11f6cf499355e882970a6d3aa7d",
+ "sha256:546c1a7470ecbf6239101e9aff0f709b68ca0f0268b34d9023019a55baa1f7c6",
+ "sha256:5843feb914796b1f0405ccf31ea0fb51034ceb65a7588edfd5a8250cb369e3b2",
+ "sha256:586724a0276d7a6083a17259d0b51622e492289a9998848a1b01b6441ca12b2f",
+ "sha256:59e975be4206cc54d0a112ef72fa3970a57c2b1bcc2c97ed41d6df0ebe518228",
+ "sha256:5a453c45178d7864c1bdd6bfe0ee3ed2883f63b9ba2c9bb967c6b586bf763f96",
+ "sha256:5bb05c28298803a74add08ba496879242ef159c75bc86a5406fac0ffc7dd021b",
+ "sha256:5e973e637112391f05581f427659c05b30b6843bc522a65be35ac7b18ce3dedd",
+ "sha256:66c80e9b366012dbee262bb1869e4fc5ba8786cda85928481528bc4a72ec2ee8",
+ "sha256:6a7620404da966f637b9ce8d4d3d543d363223f7a12452a575189c5355fc2d25",
+ "sha256:6f8bf4708a11b47517baf962f9a02196478bbb10fdb9582add4aa1459fa82380",
+ "sha256:735cd4528c55dbe4516d6d2b403331a99fc304f8feded8ae887cf97b67d589bb",
+ "sha256:7778c224efc38a40d274da4eb82a04cac27aae20012372a7db3c4bbd8926c4d4",
+ "sha256:8277d1f6282463c40761f802b742f833f9f2449fcdbb20a96579aa05c8feb614",
+ "sha256:88b6ea78b83d2796f330b0af1b70cdd3965dbdab02d8ac293260ec2c8fe340ee",
+ "sha256:8c07220408d3268e8268c9351c5c08041bc6f8c6172e59d398b71020df108541",
+ "sha256:8d0c019ee7dcf2c60e240877107cddbd95a5b1081787579bf179938392d66480",
+ "sha256:90b0186516b7a101c14764b0c25931b741fb0102f21253eff67847b4742dfc72",
+ "sha256:9837ac1650cc68d22a3cf5f15fb62c6964747d16cecc8b22431f113d6e39555d",
+ "sha256:9eac51307c6a1a38d5f86ebabc26a889fddf20cbba7a116ccb54ba1446601d5b",
+ "sha256:9f0c0d88b84259f93c3aa46398680646f2c23e43394779758d9f739c34e15295",
+ "sha256:a0ad38bc98d0b0497a0b0dbc29409bcabfcecff4511ed7063403c86de16927bc",
+ "sha256:b265cde49774752aec9ca7f5d272e3f98718164afc85521622a8a5394158a2b5",
+ "sha256:b6a107ab06206acc5359d4c5632bd9b22d448702a79b3169b0c62e0fb808bb2a",
+ "sha256:b7f920eaf46ebf41bd26f9df51c160d40f9e00b7b48471c3438cb8d027f7fb9b",
+ "sha256:c20498bd712b6e31a4402e1d027a1cd64f6a4a0066a3fe3c7344475886d07fdf",
+ "sha256:cb18d9cd7b3f35a2f5af47bb8ed6a5bdbf4f3ddee37f3daade4ab7864c292f5b",
+ "sha256:cf5bb9254e1c38aacf253d510d3d9be631bba21f3d068b17672b38b5cbf2fff5",
+ "sha256:d017775851a778ec9cc32651c4464079d06d927303c2dde9ae9830ccf6fe94e1",
+ "sha256:dc96668d9c7cc656609764275c5f8da58ef56d89bdd6810f6923d36497468ff7",
+ "sha256:e066a0586833d610c4bbddba0be5ba0e3e4f8e0bc5bb6d82103d8f8fc47bb59a",
+ "sha256:e3a013895c64352b49d0d8e107a84f99631b16dbab156ded33ebf0becf56c8b2",
+ "sha256:eaf905a580f2747c4a474040a5063cd5e0cc3d1d2d6edb65f28196186493ad4a"
],
"index": "ia",
- "version": "==0.5.4"
+ "version": "==0.6.1"
},
"pytz": {
"hashes": [
- "sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4",
- "sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"
+ "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
+ "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
+ ],
+ "version": "==2022.7"
+ },
+ "pytz-deprecation-shim": {
+ "hashes": [
+ "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
+ "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
- "version": "==2020.5"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==0.1.0.post0"
},
"pyyaml": {
"hashes": [
@@ -666,84 +892,210 @@
],
"version": "==5.3.1"
},
- "raven": {
- "extras": [
- "flask"
- ],
- "hashes": [
- "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
- "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
- ],
- "index": "ia",
- "version": "==6.10.0"
- },
- "readability-lxml": {
- "hashes": [
- "sha256:e0d366a21b1bd6cca17de71a4e6ea16fcfaa8b0a5b4004e39e2c7eff884e6305",
- "sha256:e51fea56b5909aaf886d307d48e79e096293255afa567b7d08bca94d25b1a4e1"
- ],
- "version": "==0.8.1"
+ "rapidfuzz": {
+ "hashes": [
+ "sha256:020858dd89b60ce38811cd6e37875c4c3c8d7fcd8bc20a0ad2ed1f464b34dc4e",
+ "sha256:042644133244bfa7b20de635d500eb9f46af7097f3d90b1724f94866f17cb55e",
+ "sha256:08590905a95ccfa43f4df353dcc5d28c15d70664299c64abcad8721d89adce4f",
+ "sha256:114810491efb25464016fd554fdf1e20d390309cecef62587494fc474d4b926f",
+ "sha256:1333fb3d603d6b1040e365dca4892ba72c7e896df77a54eae27dc07db90906e3",
+ "sha256:16080c05a63d6042643ae9b6cfec1aefd3e61cef53d0abe0df3069b9d4b72077",
+ "sha256:16ffad751f43ab61001187b3fb4a9447ec2d1aedeff7c5bac86d3b95f9980cc3",
+ "sha256:1f50d1227e6e2a0e3ae1fb1c9a2e1c59577d3051af72c7cab2bcc430cb5e18da",
+ "sha256:1fbad8fb28d98980f5bff33c7842efef0315d42f0cd59082108482a7e6b61410",
+ "sha256:23524635840500ce6f4d25005c9529a97621689c85d2f727c52eed1782839a6a",
+ "sha256:24d3fea10680d085fd0a4d76e581bfb2b1074e66e78fd5964d4559e1fcd2a2d4",
+ "sha256:24eb6b843492bdc63c79ee4b2f104059b7a2201fef17f25177f585d3be03405a",
+ "sha256:25b4cedf2aa19fb7212894ce5f5219010cce611b60350e9a0a4d492122e7b351",
+ "sha256:27be9c63215d302ede7d654142a2e21f0d34ea6acba512a4ae4cfd52bbaa5b59",
+ "sha256:2c836f0f2d33d4614c3fbaf9a1eb5407c0fe23f8876f47fd15b90f78daa64c34",
+ "sha256:3a9bd02e1679c0fd2ecf69b72d0652dbe2a9844eaf04a36ddf4adfbd70010e95",
+ "sha256:3d8b081988d0a49c486e4e845a547565fee7c6e7ad8be57ff29c3d7c14c6894c",
+ "sha256:3dcffe1f3cbda0dc32133a2ae2255526561ca594f15f9644384549037b355245",
+ "sha256:3f11a7eff7bc6301cd6a5d43f309e22a815af07e1f08eeb2182892fca04c86cb",
+ "sha256:42085d4b154a8232767de8296ac39c8af5bccee6b823b0507de35f51c9cbc2d7",
+ "sha256:424f82c35dbe4f83bdc3b490d7d696a1dc6423b3d911460f5493b7ffae999fd2",
+ "sha256:43fb8cb030f888c3f076d40d428ed5eb4331f5dd6cf1796cfa39c67bf0f0fc1e",
+ "sha256:460853983ab88f873173e27cc601c5276d469388e6ad6e08c4fd57b2a86f1064",
+ "sha256:467c1505362823a5af12b10234cb1c4771ccf124c00e3fc9a43696512bd52293",
+ "sha256:46b9b8aa09998bc48dd800854e8d9b74bc534d7922c1d6e1bbf783e7fa6ac29c",
+ "sha256:53dcae85956853b787c27c1cb06f18bb450e22cf57a4ad3444cf03b8ff31724a",
+ "sha256:585206112c294e335d84de5d5f179c0f932837752d7420e3de21db7fdc476278",
+ "sha256:5ada0a14c67452358c1ee52ad14b80517a87b944897aaec3e875279371a9cb96",
+ "sha256:5e2b3d020219baa75f82a4e24b7c8adcb598c62f0e54e763c39361a9e5bad510",
+ "sha256:6120f2995f5154057454c5de99d86b4ef3b38397899b5da1265467e8980b2f60",
+ "sha256:68a89bb06d5a331511961f4d3fa7606f8e21237467ba9997cae6f67a1c2c2b9e",
+ "sha256:7496e8779905b02abc0ab4ba2a848e802ab99a6e20756ffc967a0de4900bd3da",
+ "sha256:759a3361711586a29bc753d3d1bdb862983bd9b9f37fbd7f6216c24f7c972554",
+ "sha256:75c45dcd595f8178412367e302fd022860ea025dc4a78b197b35428081ed33d5",
+ "sha256:7d005e058d86f2a968a8d28ca6f2052fab1f124a39035aa0523261d6baf21e1f",
+ "sha256:7f7930adf84301797c3f09c94b9c5a9ed90a9e8b8ed19b41d2384937e0f9f5bd",
+ "sha256:8109e0324d21993d5b2d111742bf5958f3516bf8c59f297c5d1cc25a2342eb66",
+ "sha256:81642a24798851b118f82884205fc1bd9ff70b655c04018c467824b6ecc1fabc",
+ "sha256:8450d15f7765482e86ef9be2ad1a05683cd826f59ad236ef7b9fb606464a56aa",
+ "sha256:875d51b3497439a72e2d76183e1cb5468f3f979ab2ddfc1d1f7dde3b1ecfb42f",
+ "sha256:8b477b43ced896301665183a5e0faec0f5aea2373005648da8bdcb3c4b73f280",
+ "sha256:8d3e252d4127c79b4d7c2ae47271636cbaca905c8bb46d80c7930ab906cf4b5c",
+ "sha256:916bc2e6cf492c77ad6deb7bcd088f0ce9c607aaeabc543edeb703e1fbc43e31",
+ "sha256:988f8f6abfba7ee79449f8b50687c174733b079521c3cc121d65ad2d38831846",
+ "sha256:99a84ab9ac9a823e7e93b4414f86344052a5f3e23b23aa365cda01393ad895bd",
+ "sha256:9be02162af0376d64b840f2fc8ee3366794fc149f1e06d095a6a1d42447d97c5",
+ "sha256:a5585189b3d90d81ccd62d4f18530d5ac8972021f0aaaa1ffc6af387ff1dce75",
+ "sha256:ae33a72336059213996fe4baca4e0e4860913905c2efb7c991eab33b95a98a0a",
+ "sha256:af4f7c3c904ca709493eb66ca9080b44190c38e9ecb3b48b96d38825d5672559",
+ "sha256:b20141fa6cee041917801de0bab503447196d372d4c7ee9a03721b0a8edf5337",
+ "sha256:b3210869161a864f3831635bb13d24f4708c0aa7208ef5baac1ac4d46e9b4208",
+ "sha256:b34e8c0e492949ecdd5da46a1cfc856a342e2f0389b379b1a45a3cdcd3176a6e",
+ "sha256:b52ac2626945cd21a2487aeefed794c14ee31514c8ae69b7599170418211e6f6",
+ "sha256:b5dd713a1734574c2850c566ac4286594bacbc2d60b9170b795bee4b68656625",
+ "sha256:b5f705652360d520c2de52bee11100c92f59b3e3daca308ebb150cbc58aecdad",
+ "sha256:b6389c50d8d214c9cd11a77f6d501529cb23279a9c9cafe519a3a4b503b5f72a",
+ "sha256:b6bad92de071cbffa2acd4239c1779f66851b60ffbbda0e4f4e8a2e9b17e7eef",
+ "sha256:b75dd0928ce8e216f88660ab3d5c5ffe990f4dd682fd1709dba29d5dafdde6de",
+ "sha256:c2523f8180ebd9796c18d809e9a19075a1060b1a170fde3799e83db940c1b6d5",
+ "sha256:c31022d9970177f6affc6d5dd757ed22e44a10890212032fabab903fdee3bfe7",
+ "sha256:c36fd260084bb636b9400bb92016c6bd81fd80e59ed47f2466f85eda1fc9f782",
+ "sha256:c3741cb0bf9794783028e8b0cf23dab917fa5e37a6093b94c4c2f805f8e36b9f",
+ "sha256:c3fbe449d869ea4d0909fc9d862007fb39a584fb0b73349a6aab336f0d90eaed",
+ "sha256:c66546e30addb04a16cd864f10f5821272a1bfe6462ee5605613b4f1cb6f7b48",
+ "sha256:c71d9d512b76f05fa00282227c2ae884abb60e09f08b5ca3132b7e7431ac7f0d",
+ "sha256:c8601a66fbfc0052bb7860d2eacd303fcde3c14e87fdde409eceff516d659e77",
+ "sha256:c88adbcb933f6b8612f6c593384bf824e562bb35fc8a0f55fac690ab5b3486e5",
+ "sha256:ca00fafd2756bc9649bf80f1cf72c647dce38635f0695d7ce804bc0f759aa756",
+ "sha256:ca8a23097c1f50e0fdb4de9e427537ca122a18df2eead06ed39c3a0bef6d9d3a",
+ "sha256:cda1e2f66bb4ba7261a0f4c2d052d5d909798fca557cbff68f8a79a87d66a18f",
+ "sha256:cdfc04f7647c29fb48da7a04082c34cdb16f878d3c6d098d62d5715c0ad3000c",
+ "sha256:cf62dacb3f9234f3fddd74e178e6d25c68f2067fde765f1d95f87b1381248f58",
+ "sha256:d00df2e4a81ffa56a6b1ec4d2bc29afdcb7f565e0b8cd3092fece2290c4c7a79",
+ "sha256:d248a109699ce9992304e79c1f8735c82cc4c1386cd8e27027329c0549f248a2",
+ "sha256:d63def9bbc6b35aef4d76dc740301a4185867e8870cbb8719ec9de672212fca8",
+ "sha256:d82f20c0060ffdaadaf642b88ab0aa52365b56dffae812e188e5bdb998043588",
+ "sha256:dbcf5371ea704759fcce772c66a07647751d1f5dbdec7818331c9b31ae996c77",
+ "sha256:e8914dad106dacb0775718e54bf15e528055c4e92fb2677842996f2d52da5069",
+ "sha256:ebe303cd9839af69dd1f7942acaa80b1ba90bacef2e7ded9347fbed4f1654672",
+ "sha256:ec55a81ac2b0f41b8d6fb29aad16e55417036c7563bad5568686931aa4ff08f7",
+ "sha256:effe182767d102cb65dfbbf74192237dbd22d4191928d59415aa7d7c861d8c88",
+ "sha256:f42b82f268689f429def9ecfb86fa65ceea0eaf3fed408b570fe113311bf5ce7",
+ "sha256:f6fe570e20e293eb50491ae14ddeef71a6a7e5f59d7e791393ffa99b13f1f8c2",
+ "sha256:f799d1d6c33d81e983d3682571cc7d993ae7ff772c19b3aabb767039c33f6d1e",
+ "sha256:f891b98f8bc6c9d521785816085e9657212621e93f223917fb8e32f318b2957e",
+ "sha256:fa263135b892686e11d5b84f6a1892523123a00b7e5882eff4fbdabb38667347",
+ "sha256:fa4c598ed77f74ec973247ca776341200b0f93ec3883e34c222907ce72cb92a4",
+ "sha256:fe56659ccadbee97908132135de4b875543353351e0c92e736b7c57aee298b5a",
+ "sha256:fe59a0c21a032024edb0c8e43f5dee5623fef0b65a1e3c1281836d9ce199af3b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.13.7"
},
"redis": {
"hashes": [
- "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2",
- "sha256:432b788c4530cfe16d8d943a09d40ca6c16149727e4afe8c2c9d5580c59d9f24"
+ "sha256:7b8c87d19c45d3f1271b124858d2a5c13160c4e74d4835e28273400fa34d5228",
+ "sha256:cae3ee5d1f57d8caf534cd8764edf3163c77e073bdd74b6f54a87ffafdc5e7d9"
],
- "version": "==3.5.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"regex": {
"hashes": [
- "sha256:02951b7dacb123d8ea6da44fe45ddd084aa6777d4b2454fa0da61d569c6fa538",
- "sha256:0d08e71e70c0237883d0bef12cad5145b84c3705e9c6a588b2a9c7080e5af2a4",
- "sha256:1862a9d9194fae76a7aaf0150d5f2a8ec1da89e8b55890b1786b8f88a0f619dc",
- "sha256:1ab79fcb02b930de09c76d024d279686ec5d532eb814fd0ed1e0051eb8bd2daa",
- "sha256:1fa7ee9c2a0e30405e21031d07d7ba8617bc590d391adfc2b7f1e8b99f46f444",
- "sha256:262c6825b309e6485ec2493ffc7e62a13cf13fb2a8b6d212f72bd53ad34118f1",
- "sha256:2a11a3e90bd9901d70a5b31d7dd85114755a581a5da3fc996abfefa48aee78af",
- "sha256:2c99e97d388cd0a8d30f7c514d67887d8021541b875baf09791a3baad48bb4f8",
- "sha256:3128e30d83f2e70b0bed9b2a34e92707d0877e460b402faca908c6667092ada9",
- "sha256:38c8fd190db64f513fe4e1baa59fed086ae71fa45083b6936b52d34df8f86a88",
- "sha256:3bddc701bdd1efa0d5264d2649588cbfda549b2899dc8d50417e47a82e1387ba",
- "sha256:4902e6aa086cbb224241adbc2f06235927d5cdacffb2425c73e6570e8d862364",
- "sha256:49cae022fa13f09be91b2c880e58e14b6da5d10639ed45ca69b85faf039f7a4e",
- "sha256:56e01daca75eae420bce184edd8bb341c8eebb19dd3bce7266332258f9fb9dd7",
- "sha256:5862975b45d451b6db51c2e654990c1820523a5b07100fc6903e9c86575202a0",
- "sha256:6a8ce43923c518c24a2579fda49f093f1397dad5d18346211e46f134fc624e31",
- "sha256:6c54ce4b5d61a7129bad5c5dc279e222afd00e721bf92f9ef09e4fae28755683",
- "sha256:6e4b08c6f8daca7d8f07c8d24e4331ae7953333dbd09c648ed6ebd24db5a10ee",
- "sha256:717881211f46de3ab130b58ec0908267961fadc06e44f974466d1887f865bd5b",
- "sha256:749078d1eb89484db5f34b4012092ad14b327944ee7f1c4f74d6279a6e4d1884",
- "sha256:7913bd25f4ab274ba37bc97ad0e21c31004224ccb02765ad984eef43e04acc6c",
- "sha256:7a25fcbeae08f96a754b45bdc050e1fb94b95cab046bf56b016c25e9ab127b3e",
- "sha256:83d6b356e116ca119db8e7c6fc2983289d87b27b3fac238cfe5dca529d884562",
- "sha256:8b882a78c320478b12ff024e81dc7d43c1462aa4a3341c754ee65d857a521f85",
- "sha256:8f6a2229e8ad946e36815f2a03386bb8353d4bde368fdf8ca5f0cb97264d3b5c",
- "sha256:9801c4c1d9ae6a70aeb2128e5b4b68c45d4f0af0d1535500884d644fa9b768c6",
- "sha256:a15f64ae3a027b64496a71ab1f722355e570c3fac5ba2801cafce846bf5af01d",
- "sha256:a3d748383762e56337c39ab35c6ed4deb88df5326f97a38946ddd19028ecce6b",
- "sha256:a63f1a07932c9686d2d416fb295ec2c01ab246e89b4d58e5fa468089cab44b70",
- "sha256:b2b1a5ddae3677d89b686e5c625fc5547c6e492bd755b520de5332773a8af06b",
- "sha256:b2f4007bff007c96a173e24dcda236e5e83bde4358a557f9ccf5e014439eae4b",
- "sha256:baf378ba6151f6e272824b86a774326f692bc2ef4cc5ce8d5bc76e38c813a55f",
- "sha256:bafb01b4688833e099d79e7efd23f99172f501a15c44f21ea2118681473fdba0",
- "sha256:bba349276b126947b014e50ab3316c027cac1495992f10e5682dc677b3dfa0c5",
- "sha256:c084582d4215593f2f1d28b65d2a2f3aceff8342aa85afd7be23a9cad74a0de5",
- "sha256:d1ebb090a426db66dd80df8ca85adc4abfcbad8a7c2e9a5ec7513ede522e0a8f",
- "sha256:d2d8ce12b7c12c87e41123997ebaf1a5767a5be3ec545f64675388970f415e2e",
- "sha256:e32f5f3d1b1c663af7f9c4c1e72e6ffe9a78c03a31e149259f531e0fed826512",
- "sha256:e3faaf10a0d1e8e23a9b51d1900b72e1635c2d5b0e1bea1c18022486a8e2e52d",
- "sha256:f7d29a6fc4760300f86ae329e3b6ca28ea9c20823df123a2ea8693e967b29917",
- "sha256:f8f295db00ef5f8bae530fc39af0b40486ca6068733fb860b42115052206466f"
- ],
- "version": "==2020.11.13"
+ "sha256:052b670fafbe30966bbe5d025e90b2a491f85dfe5b2583a163b5e60a85a321ad",
+ "sha256:0653d012b3bf45f194e5e6a41df9258811ac8fc395579fa82958a8b76286bea4",
+ "sha256:0a069c8483466806ab94ea9068c34b200b8bfc66b6762f45a831c4baaa9e8cdd",
+ "sha256:0cf0da36a212978be2c2e2e2d04bdff46f850108fccc1851332bcae51c8907cc",
+ "sha256:131d4be09bea7ce2577f9623e415cab287a3c8e0624f778c1d955ec7c281bd4d",
+ "sha256:144486e029793a733e43b2e37df16a16df4ceb62102636ff3db6033994711066",
+ "sha256:1ddf14031a3882f684b8642cb74eea3af93a2be68893901b2b387c5fd92a03ec",
+ "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9",
+ "sha256:20f61c9944f0be2dc2b75689ba409938c14876c19d02f7585af4460b6a21403e",
+ "sha256:22960019a842777a9fa5134c2364efaed5fbf9610ddc5c904bd3a400973b0eb8",
+ "sha256:22e7ebc231d28393dfdc19b185d97e14a0f178bedd78e85aad660e93b646604e",
+ "sha256:23cbb932cc53a86ebde0fb72e7e645f9a5eec1a5af7aa9ce333e46286caef783",
+ "sha256:29c04741b9ae13d1e94cf93fca257730b97ce6ea64cfe1eba11cf9ac4e85afb6",
+ "sha256:2bde29cc44fa81c0a0c8686992c3080b37c488df167a371500b2a43ce9f026d1",
+ "sha256:2cdc55ca07b4e70dda898d2ab7150ecf17c990076d3acd7a5f3b25cb23a69f1c",
+ "sha256:370f6e97d02bf2dd20d7468ce4f38e173a124e769762d00beadec3bc2f4b3bc4",
+ "sha256:395161bbdbd04a8333b9ff9763a05e9ceb4fe210e3c7690f5e68cedd3d65d8e1",
+ "sha256:44136355e2f5e06bf6b23d337a75386371ba742ffa771440b85bed367c1318d1",
+ "sha256:44a6c2f6374e0033873e9ed577a54a3602b4f609867794c1a3ebba65e4c93ee7",
+ "sha256:4919899577ba37f505aaebdf6e7dc812d55e8f097331312db7f1aab18767cce8",
+ "sha256:4b4b1fe58cd102d75ef0552cf17242705ce0759f9695334a56644ad2d83903fe",
+ "sha256:4bdd56ee719a8f751cf5a593476a441c4e56c9b64dc1f0f30902858c4ef8771d",
+ "sha256:4bf41b8b0a80708f7e0384519795e80dcb44d7199a35d52c15cc674d10b3081b",
+ "sha256:4cac3405d8dda8bc6ed499557625585544dd5cbf32072dcc72b5a176cb1271c8",
+ "sha256:4fe7fda2fe7c8890d454f2cbc91d6c01baf206fbc96d89a80241a02985118c0c",
+ "sha256:50921c140561d3db2ab9f5b11c5184846cde686bb5a9dc64cae442926e86f3af",
+ "sha256:5217c25229b6a85049416a5c1e6451e9060a1edcf988641e309dbe3ab26d3e49",
+ "sha256:5352bea8a8f84b89d45ccc503f390a6be77917932b1c98c4cdc3565137acc714",
+ "sha256:542e3e306d1669b25936b64917285cdffcd4f5c6f0247636fec037187bd93542",
+ "sha256:543883e3496c8b6d58bd036c99486c3c8387c2fc01f7a342b760c1ea3158a318",
+ "sha256:586b36ebda81e6c1a9c5a5d0bfdc236399ba6595e1397842fd4a45648c30f35e",
+ "sha256:597f899f4ed42a38df7b0e46714880fb4e19a25c2f66e5c908805466721760f5",
+ "sha256:5a260758454580f11dd8743fa98319bb046037dfab4f7828008909d0aa5292bc",
+ "sha256:5aefb84a301327ad115e9d346c8e2760009131d9d4b4c6b213648d02e2abe144",
+ "sha256:5e6a5567078b3eaed93558842346c9d678e116ab0135e22eb72db8325e90b453",
+ "sha256:5ff525698de226c0ca743bfa71fc6b378cda2ddcf0d22d7c37b1cc925c9650a5",
+ "sha256:61edbca89aa3f5ef7ecac8c23d975fe7261c12665f1d90a6b1af527bba86ce61",
+ "sha256:659175b2144d199560d99a8d13b2228b85e6019b6e09e556209dfb8c37b78a11",
+ "sha256:6a9a19bea8495bb419dc5d38c4519567781cd8d571c72efc6aa959473d10221a",
+ "sha256:6b30bddd61d2a3261f025ad0f9ee2586988c6a00c780a2fb0a92cea2aa702c54",
+ "sha256:6ffd55b5aedc6f25fd8d9f905c9376ca44fcf768673ffb9d160dd6f409bfda73",
+ "sha256:702d8fc6f25bbf412ee706bd73019da5e44a8400861dfff7ff31eb5b4a1276dc",
+ "sha256:74bcab50a13960f2a610cdcd066e25f1fd59e23b69637c92ad470784a51b1347",
+ "sha256:75f591b2055523fc02a4bbe598aa867df9e953255f0b7f7715d2a36a9c30065c",
+ "sha256:763b64853b0a8f4f9cfb41a76a4a85a9bcda7fdda5cb057016e7706fde928e66",
+ "sha256:76c598ca73ec73a2f568e2a72ba46c3b6c8690ad9a07092b18e48ceb936e9f0c",
+ "sha256:78d680ef3e4d405f36f0d6d1ea54e740366f061645930072d39bca16a10d8c93",
+ "sha256:7b280948d00bd3973c1998f92e22aa3ecb76682e3a4255f33e1020bd32adf443",
+ "sha256:7db345956ecce0c99b97b042b4ca7326feeec6b75facd8390af73b18e2650ffc",
+ "sha256:7dbdce0c534bbf52274b94768b3498abdf675a691fec5f751b6057b3030f34c1",
+ "sha256:7ef6b5942e6bfc5706301a18a62300c60db9af7f6368042227ccb7eeb22d0892",
+ "sha256:7f5a3ffc731494f1a57bd91c47dc483a1e10048131ffb52d901bfe2beb6102e8",
+ "sha256:8a45b6514861916c429e6059a55cf7db74670eaed2052a648e3e4d04f070e001",
+ "sha256:8ad241da7fac963d7573cc67a064c57c58766b62a9a20c452ca1f21050868dfa",
+ "sha256:8b0886885f7323beea6f552c28bff62cbe0983b9fbb94126531693ea6c5ebb90",
+ "sha256:8ca88da1bd78990b536c4a7765f719803eb4f8f9971cc22d6ca965c10a7f2c4c",
+ "sha256:8e0caeff18b96ea90fc0eb6e3bdb2b10ab5b01a95128dfeccb64a7238decf5f0",
+ "sha256:957403a978e10fb3ca42572a23e6f7badff39aa1ce2f4ade68ee452dc6807692",
+ "sha256:9af69f6746120998cd9c355e9c3c6aec7dff70d47247188feb4f829502be8ab4",
+ "sha256:9c94f7cc91ab16b36ba5ce476f1904c91d6c92441f01cd61a8e2729442d6fcf5",
+ "sha256:a37d51fa9a00d265cf73f3de3930fa9c41548177ba4f0faf76e61d512c774690",
+ "sha256:a3a98921da9a1bf8457aeee6a551948a83601689e5ecdd736894ea9bbec77e83",
+ "sha256:a3c1ebd4ed8e76e886507c9eddb1a891673686c813adf889b864a17fafcf6d66",
+ "sha256:a5f9505efd574d1e5b4a76ac9dd92a12acb2b309551e9aa874c13c11caefbe4f",
+ "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f",
+ "sha256:a9d0b68ac1743964755ae2d89772c7e6fb0118acd4d0b7464eaf3921c6b49dd4",
+ "sha256:aa62a07ac93b7cb6b7d0389d8ef57ffc321d78f60c037b19dfa78d6b17c928ee",
+ "sha256:ac741bf78b9bb432e2d314439275235f41656e189856b11fb4e774d9f7246d81",
+ "sha256:ae1e96785696b543394a4e3f15f3f225d44f3c55dafe3f206493031419fedf95",
+ "sha256:b683e5fd7f74fb66e89a1ed16076dbab3f8e9f34c18b1979ded614fe10cdc4d9",
+ "sha256:b7a8b43ee64ca8f4befa2bea4083f7c52c92864d8518244bfa6e88c751fa8fff",
+ "sha256:b8e38472739028e5f2c3a4aded0ab7eadc447f0d84f310c7a8bb697ec417229e",
+ "sha256:bfff48c7bd23c6e2aec6454aaf6edc44444b229e94743b34bdcdda2e35126cf5",
+ "sha256:c14b63c9d7bab795d17392c7c1f9aaabbffd4cf4387725a0ac69109fb3b550c6",
+ "sha256:c27cc1e4b197092e50ddbf0118c788d9977f3f8f35bfbbd3e76c1846a3443df7",
+ "sha256:c28d3309ebd6d6b2cf82969b5179bed5fefe6142c70f354ece94324fa11bf6a1",
+ "sha256:c670f4773f2f6f1957ff8a3962c7dd12e4be54d05839b216cb7fd70b5a1df394",
+ "sha256:ce6910b56b700bea7be82c54ddf2e0ed792a577dfaa4a76b9af07d550af435c6",
+ "sha256:d0213671691e341f6849bf33cd9fad21f7b1cb88b89e024f33370733fec58742",
+ "sha256:d03fe67b2325cb3f09be029fd5da8df9e6974f0cde2c2ac6a79d2634e791dd57",
+ "sha256:d0e5af9a9effb88535a472e19169e09ce750c3d442fb222254a276d77808620b",
+ "sha256:d243b36fbf3d73c25e48014961e83c19c9cc92530516ce3c43050ea6276a2ab7",
+ "sha256:d26166acf62f731f50bdd885b04b38828436d74e8e362bfcb8df221d868b5d9b",
+ "sha256:d403d781b0e06d2922435ce3b8d2376579f0c217ae491e273bab8d092727d244",
+ "sha256:d8716f82502997b3d0895d1c64c3b834181b1eaca28f3f6336a71777e437c2af",
+ "sha256:e4f781ffedd17b0b834c8731b75cce2639d5a8afe961c1e58ee7f1f20b3af185",
+ "sha256:e613a98ead2005c4ce037c7b061f2409a1a4e45099edb0ef3200ee26ed2a69a8",
+ "sha256:ef4163770525257876f10e8ece1cf25b71468316f61451ded1a6f44273eedeb5"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.10.31"
},
"requests": {
"hashes": [
- "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
- "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.25.1"
+ "version": "==2.28.1"
},
"requests-file": {
"hashes": [
@@ -760,226 +1112,189 @@
},
"s3transfer": {
"hashes": [
- "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13",
- "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db"
+ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
+ "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
- "version": "==0.3.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.6.0"
},
"schedule": {
"hashes": [
- "sha256:3f895a1036799a25ab9c335de917073e63cf8256920917e932777382f101f08f",
- "sha256:f9fb5181283de4db6e701d476dd01b6a3dd81c38462a54991ddbb9d26db857c9"
+ "sha256:617adce8b4bf38c360b781297d59918fbebfb2878f1671d189f4f4af5d0567a4",
+ "sha256:e6ca13585e62c810e13a08682e0a6a8ad245372e376ba2b8679294f377dfc8e4"
],
- "version": "==0.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.1.0"
},
"schema": {
"hashes": [
- "sha256:3a03c2e2b22e6a331ae73750ab1da46916da6ca861b16e6f073ac1d1eba43b71",
- "sha256:b536f2375b49fdf56f36279addae98bd86a8afbd58b3c32ce363c464bed5fc1c"
+ "sha256:f06717112c61895cabc4707752b88716e8420a8819d71404501e114f91043197",
+ "sha256:f3ffdeeada09ec34bf40d7d79996d9f7175db93b7a5065de0faa7f41083c1e6c"
],
- "version": "==0.7.2"
+ "version": "==0.7.5"
},
"selectolax": {
"hashes": [
- "sha256:01b26820667dcd8dc0ec94ed874ffc8e45043f0da70466544a9328a79282ff41",
- "sha256:03e4c0b6d8feb16472482c89a1bf0752335d015172263388c94b8089224ed9e6",
- "sha256:061efe18e01a624e33317d1f98f20d67f025e228f5cf87f198caceadff9e77f5",
- "sha256:108f0ed757c5e74cd3d15f3ddb615c891711ae9647fb002aca6dbad5c7f0084c",
- "sha256:13e6a6ec4b8fc43ef3f6586e17ba85832bbcdf8074d9a31a159d87dd81bf2627",
- "sha256:231ce804a5e186afa4e7f1639f3a2fdefc5151c1094746fa09821c7c9f5dbeb6",
- "sha256:274d70e46a94a7b673585957574e571b1838afb5862b9edc7477f704a2e8be3f",
- "sha256:290b9bc9df879c8538899b5d22e8fa272e07c9edc438396d9b9ad631a7689837",
- "sha256:2fe935472e9c2c14caf38b65a5ea836f0c3d56081945a8588e14f4136e34ba6b",
- "sha256:37cb0fd1d933ad7321caa68773fda490d686286eaf4d77922686ad14506c4a2c",
- "sha256:38661265f318459cd93b1a87b20d8b7b5adeaa353cc96e2d5087a05eef9ce8a3",
- "sha256:3b21ba8862be4445482e6954c61562851cebd9c9e5db73b0865ea4729e7c85b0",
- "sha256:4208bfab7c5e14d54104b7959ba1d66f67a51044cb1fccbab62d12c6bd905f02",
- "sha256:4233599d6507e11a6fab67d9e933d8f445859868b4162eb71c849a832935b575",
- "sha256:4714c5e6b18ad0ca9f2919b39f333590025e46cb0bb248ffe973333bbf18a491",
- "sha256:4b9f60a689c0453b6e2a6b92dd2407c82167f3d7624b27184842b2b58d5bc353",
- "sha256:4ebb88d954dabffa3bafad6cdd758612a7d3b84ceee692c5818bbf0fa93c5f6b",
- "sha256:519335c313c49151e0a282bef88043eab8756732f24eeb42d2a17e68b3ab174e",
- "sha256:5e2fb6a27bc7760d57f8cc53adcf5b300a021a3f4102df0e5dd8abb436041c28",
- "sha256:60ba2ce5060bac7d56dedefe1403602aac1b999a60596294ce3a9520e2c95d71",
- "sha256:6f7f7a1a030c5612529c0e9df46d690b54d22416d500095ddf3985527f8fb78f",
- "sha256:804f8e954428a1a325a62a88af39e1fef87c22f0689ee3c3a1d8968ee9648f6e",
- "sha256:88cc811bb3f9c4eac303dde5ba3ecde0972dba8cebf2fb8001467e752c888838",
- "sha256:8b85a1356e180d235d9ab92bc3dd90d07e78cab1ef324ae9d12207607c9f26f6",
- "sha256:8d8e3c7c43805628f2112cda81dba0b8f6620912c24ab2d6635f351985097971",
- "sha256:90da202496bb99a0924cd26c471f455f64308ed13a24500852635aef5014a43f",
- "sha256:98e8b60fca5ca6e2f0a2a1882f0c1b771612e5016bd6605545e7c20a8baac244",
- "sha256:a18f75af342476356e5a437fc5215a3b79b58f52b56d9ea6e1a985cc21895952",
- "sha256:a36581e0a4f74c5a67d22048fbf34221f9d480bde05acc57702b1cffdcb9ecf5",
- "sha256:a6724cb313cd7805c7cf4252fdf162e7253cf3a933b7c25ac954feed3edc23ce",
- "sha256:b8632b165d5da9ecbfb671dbfa879a874cd63d2ea66a8d21b065da1236949947",
- "sha256:bba6127957c3209e141e42077d952cb1df4a5dc23c522ca9038c8013509588d8",
- "sha256:c47c7602e8cf8bdce03716b0240d2067eec92f3185cffe34813c60706559ae6a",
- "sha256:c49ac91cb291eae5c396aa87725ad066ba2fd9690289f3ffcde0022e4276b56e",
- "sha256:d4144619f88bb94ee2c29cccc23b00a020d6d140d84eda8d7fc4da05dc15f352",
- "sha256:df57fdbbf772b72993e44cdb326b4937d225e0dd2083cce56100593fe791326a",
- "sha256:e577ea359151e4df515eabc0c6ea1ddda0577971597c5e9908498a80477befc6",
- "sha256:e6857ac61acbf747ea56f6c8a72968e7a6ba88053a9a2b5b44091bfb97fb1c87",
- "sha256:ec2e3f6e49ee252c2fd0c0f297513150ec04e59c7aa0236baebeaaf21b83ffef",
- "sha256:ecdbad6c95b93256df4c3cb14612215bcd754093415615c55a191bb17fd0ebdc",
- "sha256:f22653fd48a7f835891bab16095c6f983994d68d16925447e537eb6e3ab79fc4",
- "sha256:f6c637636cc3bd0025dc9bd07fde28d482c93a6c21cf2e88b827a06766b2b314",
- "sha256:f83648e412aa610bdff1259dc412383fb290427c05f54e4fad1419b16aca19fe",
- "sha256:f8f8488fa5859b0da7e4a1bd265b5c0bba45dbf8286e6cee17bf95bcb3d5e797"
+ "sha256:010b008aca04be6cf9727d6f206a583d79a82d397126a101f57f117113a082bb",
+ "sha256:0878aa1ab3906831b20ad9e316a77c8401030dd388f3c1c72ba51bc08d497584",
+ "sha256:087e663c0ba6d9d79294508b0a3145079e838950a0e2fc7b8b1485da3fe24254",
+ "sha256:0a8dddd34dea642429629aae21cf940668eaa1c66ab0bcf9970d72f38676697d",
+ "sha256:14c9368f9dd224f895ef1431b1961d6e9a56fb26a95b5c04900def7b8961744c",
+ "sha256:17ac0b2b4222ba2c16852c0035dcd31d9e100544e6a5138f6e01f6b1648691b5",
+ "sha256:1ba1cd707a0d0090cffb2851ec6ccfdc334ed0c2ea08ae8705a9f6c97a997f77",
+ "sha256:1d38157e2358dacf55e782d332b41391821b2ef237e34e47ff276b2184c96542",
+ "sha256:1f1ec20cc75e1866f7758e543907da222c5d8072e580cf6814f2f142036c695f",
+ "sha256:1fa1737b7031b467d8613919503c85482a59c65ac91fe60074180e625e2533c6",
+ "sha256:221051ffe8c2950e9ebe41e08103397a7b287dca05a9e8084bb9e925f2d9c556",
+ "sha256:264918c1e9e6f6657f47116e4dbd74b57c660d3e86f9cc78209f132c56c8e9e5",
+ "sha256:2d8c7ce06bdf83d3cd2a617211eec48c875826bae54c74e56aec2635daac2f31",
+ "sha256:31fb0fbc88674b3346e379664c5837070e79b2f65eab3e29b7c43e1b4fc1137c",
+ "sha256:3600747c5072725580f8dc249a40ae123840f22edab950f43b349d356f44268b",
+ "sha256:3d65d0c57cfa1b05beb5c72d3cb566f4fdaf16e5112082f300cfa6bd94836aff",
+ "sha256:3daaf7ec54565d3f15f9ce046f6a8e469d966dc4fc879af8c7f753d37994f70e",
+ "sha256:418738a2f46beea2444a1587adb4f509bdd8e7ddffac071dba097c1a3ddb8cfc",
+ "sha256:46776ca482a76b3f522e4d8f90474716e4da51dc2823f3ecc6a2ff38ef0663b7",
+ "sha256:46bacca9e9f077ff2c5a973c05b8862425f077c58f2dca8059b992ceaca6b6de",
+ "sha256:4c5c68f0139d0928298ef5e95137996e0efb6f8db364b1470221e8710834a0ab",
+ "sha256:51c33d33e4e4eec0d9c1b6accdda5c93f4e3a00b28e99fc4ebb2b95d1d4ef885",
+ "sha256:585a75f4aff85b48d0fc8f3e9afbd1e2c05902a332982d04bab93e8e1db2e4a4",
+ "sha256:5acbe02c26b43428c2f49e8f09a81bd47be7ea969c6798cde1a23c2b33d25c79",
+ "sha256:6111ac9e5ca02b13d8e3057c1e20d6608435c64a11f92460a59951a7209c2cf3",
+ "sha256:67c32c29bc9011ed1b6fd67a961073e69d67bf60bf09f3db54d6240c034719f4",
+ "sha256:68c42af2cabecf04528dff2d0bbebbecfbafc394a5192b6a5b3e1dcd19eeb766",
+ "sha256:709b1680a16f210c43e4f3240dfc15e3312ccd43c9ea20c8e20c81470214cfc6",
+ "sha256:762e91a0ac0caa2d8731568e5b2ad0cec6fc06465a9dd89280118ced4b7e0849",
+ "sha256:7d47e489a8b0181992a3384987c854bd88211685e1c32dcdcb8746ec98dbcf7e",
+ "sha256:7ebe824763782f0e6ad2accd57d0cef3a61922b72be99ccafebe0154e9b8aef6",
+ "sha256:7f1a35be9413bcd56f225b1509740ea8999a6f7558e0f0a50a4ca80b91bf11be",
+ "sha256:81c7847ff0f3561559bd98015aa3fe0a2dfb26966156f7704f7f65339d48e81c",
+ "sha256:9246bf586afaacfdc0e6fb17806ee0d3e1736d3d13a87c8e96214596d50576b7",
+ "sha256:9baff22ae7015e8f2697d5db0804ee379d53fa6e54f1dc7e9f61ee8ccb1bdb2e",
+ "sha256:a4634d7c7e9d2eb65d0fc7fe0d88641eb413cb7250fbfc66b3b4d88d49e4c724",
+ "sha256:a7fa03253260c3351f61cef36865b27ad4585516e9ac4a77244d237bfaf37f13",
+ "sha256:abac4b7afe430dd135f148d4001b593b09c8f64fccd63b15fbb03b77735e3405",
+ "sha256:ad0cfc7f66a2863d199af819c79bfa160bcc830e0f83fd5391cdd80e545af758",
+ "sha256:adabfb5635d00da49bddef3844dc65ca3da81acd889ea7be2a74ef9456558f36",
+ "sha256:ae58e7cc282a768a68abbfa39eff895788a39658c5a235524c21b09d182b3d3a",
+ "sha256:b348074bc3a0e16e9af1a2f57e0da18f5def97e415c6435dadc68aead7ccf060",
+ "sha256:b48e4c8df2c226552ac18636c2ebe9d100ff3daa8742616687bd2cbf74a81e2f",
+ "sha256:c23d9f82aea887347151538a58b15a8dbee4261e4114705c0974dee81eb796e0",
+ "sha256:c2b589be0dd45d62ec43a6446f09919b5be809c708d8ff6a7cb86acd9150091b",
+ "sha256:d13904fc037bcebc6d79e83c0a19e64cc9d4771cd7f27b325c63d1071ec0d0f0",
+ "sha256:d3506e831b972c1eb22538b25e7c991289b72b2e028bd27b633dfbd21c1a511a",
+ "sha256:d809fbf258c28190160b3fe5d34adddb1da44ed7a2f800b7125e0fac6e940016",
+ "sha256:da688ca957d68b8072dc9658506c07326f6332ff3fe03214fec375a4ccc67f8a",
+ "sha256:e001a40b25e478f8390c3898c5852cf9a226668ba02fdc4d8e3a4788ce64207a",
+ "sha256:e805b106edac716047afc6e9e49953242207909bfbb70bf47c53f231e2d27d74",
+ "sha256:eb86cacac6ed203c386afe6704732fb05d831006c65869f15f41d15e9e72973b",
+ "sha256:f5cef3310fc41f71e8fc19d05534d100f6c02789d46041777b0bbd70961a94ec",
+ "sha256:f76b0ad63b55e45d3c02e50ca8b8ef64a500aed9a5f50818173b66949470f8e4",
+ "sha256:fad7fb68e929082e6474e1392dd433d465b06b59e26158ef67813c0c8e5b7f66",
+ "sha256:fb3b3425ee21f5098531ce80dc48d99a555b8b2300deb0ddf84b6bc503f0a848",
+ "sha256:fc53731aa81617694667d4c56d21a9e26df840a219f4b62588af80c6781ba613"
],
"index": "ia",
- "version": "==0.2.10"
+ "version": "==0.3.11"
},
"sentry-sdk": {
"extras": [],
"hashes": [
- "sha256:0a711ec952441c2ec89b8f5d226c33bc697914f46e876b44a4edd3e7864cf4d0",
- "sha256:737a094e49a529dd0fdcaafa9e97cf7c3d5eb964bd229821d640bc77f3502b3f"
+ "sha256:5bbe4b72de22f9ac1e67f2a4e6efe8fbd595bb59b7b223443f50fe5802a5551c",
+ "sha256:9f0b960694e2d8bb04db4ba6ac2a645040caef4e762c65937998ff06064f10d6"
],
"index": "ia",
- "version": "==0.19.5"
+ "version": "==1.12.1"
},
"six": {
"hashes": [
- "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
- "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
- "version": "==1.15.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
},
"soupsieve": {
"hashes": [
- "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851",
- "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"
- ],
- "markers": "python_version >= '3.0'",
- "version": "==2.1"
- },
- "sqlalchemy": {
- "hashes": [
- "sha256:04f995fcbf54e46cddeb4f75ce9dfc17075d6ae04ac23b2bacb44b3bc6f6bf11",
- "sha256:0c6406a78a714a540d980a680b86654feadb81c8d0eecb59f3d6c554a4c69f19",
- "sha256:0c72b90988be749e04eff0342dcc98c18a14461eb4b2ad59d611b57b31120f90",
- "sha256:108580808803c7732f34798eb4a329d45b04c562ed83ee90f09f6a184a42b766",
- "sha256:1418f5e71d6081aa1095a1d6b567a562d2761996710bdce9b6e6ba20a03d0864",
- "sha256:17610d573e698bf395afbbff946544fbce7c5f4ee77b5bcb1f821b36345fae7a",
- "sha256:216ba5b4299c95ed179b58f298bda885a476b16288ab7243e89f29f6aeced7e0",
- "sha256:2ff132a379838b1abf83c065be54cef32b47c987aedd06b82fc76476c85225eb",
- "sha256:314f5042c0b047438e19401d5f29757a511cfc2f0c40d28047ca0e4c95eabb5b",
- "sha256:318b5b727e00662e5fc4b4cd2bf58a5116d7c1b4dd56ffaa7d68f43458a8d1ed",
- "sha256:3ab5b44a07b8c562c6dcb7433c6a6c6e03266d19d64f87b3333eda34e3b9936b",
- "sha256:426ece890153ccc52cc5151a1a0ed540a5a7825414139bb4c95a868d8da54a52",
- "sha256:491fe48adc07d13e020a8b07ef82eefc227003a046809c121bea81d3dbf1832d",
- "sha256:4a84c7c7658dd22a33dab2e2aa2d17c18cb004a42388246f2e87cb4085ef2811",
- "sha256:54da615e5b92c339e339fe8536cce99fe823b6ed505d4ea344852aefa1c205fb",
- "sha256:5a7f224cdb7233182cec2a45d4c633951268d6a9bcedac37abbf79dd07012aea",
- "sha256:61628715931f4962e0cdb2a7c87ff39eea320d2aa96bd471a3c293d146f90394",
- "sha256:62285607a5264d1f91590abd874d6a498e229d5840669bd7d9f654cfaa599bd0",
- "sha256:62fb881ba51dbacba9af9b779211cf9acff3442d4f2993142015b22b3cd1f92a",
- "sha256:68428818cf80c60dc04aa0f38da20ad39b28aba4d4d199f949e7d6e04444ea86",
- "sha256:6aaa13ee40c4552d5f3a59f543f0db6e31712cc4009ec7385407be4627259d41",
- "sha256:70121f0ae48b25ef3e56e477b88cd0b0af0e1f3a53b5554071aa6a93ef378a03",
- "sha256:715b34578cc740b743361f7c3e5f584b04b0f1344f45afc4e87fbac4802eb0a0",
- "sha256:758fc8c4d6c0336e617f9f6919f9daea3ab6bb9b07005eda9a1a682e24a6cacc",
- "sha256:7d4b8de6bb0bc736161cb0bbd95366b11b3eb24dd6b814a143d8375e75af9990",
- "sha256:81d8d099a49f83111cce55ec03cc87eef45eec0d90f9842b4fc674f860b857b0",
- "sha256:888d5b4b5aeed0d3449de93ea80173653e939e916cc95fe8527079e50235c1d2",
- "sha256:95bde07d19c146d608bccb9b16e144ec8f139bcfe7fd72331858698a71c9b4f5",
- "sha256:9bf572e4f5aa23f88dd902f10bb103cb5979022a38eec684bfa6d61851173fec",
- "sha256:bab5a1e15b9466a25c96cda19139f3beb3e669794373b9ce28c4cf158c6e841d",
- "sha256:bd4b1af45fd322dcd1fb2a9195b4f93f570d1a5902a842e3e6051385fac88f9c",
- "sha256:bde677047305fe76c7ee3e4492b545e0018918e44141cc154fe39e124e433991",
- "sha256:c389d7cc2b821853fb018c85457da3e7941db64f4387720a329bc7ff06a27963",
- "sha256:d055ff750fcab69ca4e57b656d9c6ad33682e9b8d564f2fbe667ab95c63591b0",
- "sha256:d53f59744b01f1440a1b0973ed2c3a7de204135c593299ee997828aad5191693",
- "sha256:f115150cc4361dd46153302a640c7fa1804ac207f9cc356228248e351a8b4676",
- "sha256:f1e88b30da8163215eab643962ae9d9252e47b4ea53404f2c4f10f24e70ddc62",
- "sha256:f8191fef303025879e6c3548ecd8a95aafc0728c764ab72ec51a0bdf0c91a341"
- ],
- "version": "==1.3.22"
+ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
+ "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.3.2.post1"
},
"surt": {
"hashes": [
- "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720",
- "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720"
],
"version": "==0.3.1"
},
- "tldextract": {
+ "tld": {
"hashes": [
- "sha256:cfae9bc8bda37c3e8c7c8639711ad20e95dc85b207a256b60b0b23d7ff5540ea",
- "sha256:e57f22b6d00a28c21673d2048112f1bdcb6a14d4711568305f6bb96cf5bb53a1"
+ "sha256:266106ad9035f54cd5cce5f823911a51f697e7c58cb45bfbd6c53b4c2976ece2",
+ "sha256:69fed19d26bb3f715366fb4af66fdeace896c55c052b00e8aaba3a7b63f3e7f0",
+ "sha256:826bbe61dccc8d63144b51caef83e1373fbaac6f9ada46fca7846021f5d36fef",
+ "sha256:843844e4256c943983d86366b5af3ac9cd1c9a0b6465f04d9f70e3b4c1a7989f",
+ "sha256:a92ac6b84917e7d9e934434b8d37e9be534598f138fbb86b3c0d5426f2621890",
+ "sha256:b6650f2d5392a49760064bc55d73ce3397a378ef24ded96efb516c6b8ec68c26",
+ "sha256:ef5b162d6fa295822dacd4fe4df1b62d8df2550795a97399a8905821b58d3702"
],
- "version": "==3.1.0"
+ "markers": "python_version >= '2.7' and python_version < '4'",
+ "version": "==0.12.6"
},
- "toml": {
+ "tldextract": {
"hashes": [
- "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
- "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ "sha256:47aa4d8f1a4da79a44529c9a2ddc518663b25d371b805194ec5ce2a5f615ccd2",
+ "sha256:78aef13ac1459d519b457a03f1f74c1bf1c2808122a6bcc0e6840f81ba55ad73"
],
- "version": "==0.10.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.4.0"
},
"tqdm": {
"hashes": [
- "sha256:0cd81710de29754bf17b6fee07bdb86f956b4fa20d3078f02040f83e64309416",
- "sha256:f4f80b96e2ceafea69add7bf971b8403b9cba8fb4451c1220f91c79be4ebd208"
+ "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4",
+ "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"
],
- "version": "==4.55.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==4.64.1"
},
"trafilatura": {
"hashes": [
- "sha256:0561c80c284bd3facdb7e16dbc3105758548d1548bbc92beb3522c8ac2c27cea",
- "sha256:70fc752049772ec17225417d211af460f9e0a44e0d041ffb6f823fc25081df3b"
+ "sha256:a66189e4b9d591dce648f0cc79fb52a486e679708090189bc4fcd88068f095ef",
+ "sha256:c2bc0cbac6248363d938666cbedbb067ad8aefe31667c88038135b93efd475c3"
],
"index": "ia",
- "version": "==0.6.1"
+ "version": "==1.3.0"
},
"twitter": {
"hashes": [
- "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
- "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
- ],
- "version": "==1.18.0"
- },
- "typed-ast": {
- "hashes": [
- "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
- "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
- "sha256:0d8110d78a5736e16e26213114a38ca35cb15b6515d535413b090bd50951556d",
- "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
- "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
- "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
- "sha256:3742b32cf1c6ef124d57f95be609c473d7ec4c14d0090e5a5e05a15269fb4d0c",
- "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
- "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
- "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
- "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
- "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
- "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
- "sha256:7e4c9d7658aaa1fc80018593abdf8598bf91325af6af5cce4ce7c73bc45ea53d",
- "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
- "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
- "sha256:92c325624e304ebf0e025d1224b77dd4e6393f18aab8d829b5b7e04afe9b7a2c",
- "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
- "sha256:b52ccf7cfe4ce2a1064b18594381bccf4179c2ecf7f513134ec2f993dd4ab395",
- "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
- "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
- "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
- "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
- "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
- "sha256:d648b8e3bf2fe648745c8ffcee3db3ff903d0817a01a12dd6a6ea7a8f4889072",
- "sha256:f208eb7aff048f6bea9586e61af041ddf7f9ade7caed625742af423f6bae3298",
- "sha256:fac11badff8313e23717f3dada86a15389d0708275bddf766cca67a84ead3e91",
- "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
- "sha256:fcf135e17cc74dbfbc05894ebca928ffeb23d9790b3167a674921db19082401f",
- "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
- ],
- "version": "==1.4.1"
+ "sha256:1d9a3e45f2c440f308a7116d3672b0d1981aba8ac41cb7f3ed270ed50693f0e0",
+ "sha256:80ddd69ae2eeb88313feedeea31bf119fd6e79541ee5b37abb9c43d233194e10"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==1.19.6"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
+ },
+ "tzdata": {
+ "hashes": [
+ "sha256:2b88858b0e3120792a3c0635c23daf36a7d7eeeca657c323da299d2094402a0d",
+ "sha256:fe5f866eddd8b96e9fcba978f8e503c909b19ea7efda11e52e39494bad3a7bfa"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.7"
},
"tzlocal": {
"hashes": [
- "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
- "sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4"
+ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
+ "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
- "version": "==2.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.2"
},
"urlcanon": {
"hashes": [
@@ -990,11 +1305,11 @@
},
"urllib3": {
"hashes": [
- "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
- "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.26.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"warctools": {
"hashes": [
@@ -1007,28 +1322,22 @@
"brotli"
],
"hashes": [
- "sha256:f8979bb2a20434e6af5d14d3cfd0058fd81cfd3c5ecefc7babb0fedb3ac320a1"
+ "sha256:3a3f149508d68ec53f5cdf434a45e5bb906beef731327d7bd2ef6b751c98281b"
],
"index": "ia",
- "version": "==0.6.4"
- },
- "wayback-esp": {
- "hashes": [
- "sha256:4b735baecf34d4ad08ec4338891fbcb7879f9a58441bc5c488d067f7e2cd5318"
- ],
- "version": "==0.2.12"
+ "version": "==0.8.6.1"
},
"wayback-search-js": {
"hashes": [
- "sha256:e3f48201b02f2ae437f4e1c171c9d7bd3d8126437510c933c3e3b45334552491"
+ "sha256:a474ba8da58f9cc27b1dce7f87a8cc7d119715ab4bab750dcc1d90f002074161"
],
- "version": "==2.13.0"
+ "version": "==3.1.21"
},
"wbex-client": {
"hashes": [
- "sha256:619ead0408195f4eb87198a99e497c649961da45fcf97cb9bc937ef9e06a9e7f"
+ "sha256:8c4028d744dda05cca932b411a826f9478a65cbc018784bff9528e973c7f9c36"
],
- "version": "==0.1.6"
+ "version": "==0.1.6.1"
},
"wcwidth": {
"hashes": [
@@ -1039,89 +1348,93 @@
},
"werkzeug": {
"hashes": [
- "sha256:2de2a5db0baeae7b2d2664949077c2ac63fbd16d98da0ff71837f7d1dea3fd43",
- "sha256:6c80b1e5ad3665290ea39320b91e1be1e0d5f60652b964a3070216de83d2e47c"
+ "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
+ "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
],
- "version": "==1.0.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.0.3"
},
"zstandard": {
"hashes": [
- "sha256:064aac12b8e7813fa3870e7479e9cbd3803e33212b68e555b408711ea8f6cb54",
- "sha256:0b3ae587556a6f45cd982d7684b1318793430d0ae9e376dbc3d877b48ac6d576",
- "sha256:0b57df1f9530669d61f8708eb15ded6584db4a6733cc5155eb8561d31f292557",
- "sha256:0e5b8fd428d0d00fb7dabc0898de9e87659cb54738d527becff37d3d90df8e88",
- "sha256:17e8f29aae79d870daa3ab48c0dbf83594bf956c2c2125ae45cdfebd2b62d8ed",
- "sha256:1be45b237fea45c705d83215450a9381c2787bbf0720824b1fe23ed72f8db0b7",
- "sha256:1c065de617b7367c4da4de687a071932e48ae200d09c0afbc24415d98aec470d",
- "sha256:2075be64372206af3df40fef0fee657b44845d3e6d98b4cc8aba220be861de2d",
- "sha256:24ab8f1c7c970822bd55dbb091f7eb271b417e777e8b3ae6722e60d67f747c05",
- "sha256:2826d664eb84f9efe0fae47cf20c27f3662aae3556fbcc4cecd5318fbc9239f3",
- "sha256:3382ce6e44e9e847dce848bc2638403aa9320cb38edcc34b71e13be5793619e0",
- "sha256:36cd223d7fd0fe0e32e82993240e9a24503269c93431e62369088e2299cf4605",
- "sha256:391c30620e3ad6bc53804f32e3f74cbbaa713d95f46ac5f2e54e735d1dfc51c0",
- "sha256:3948000d753b9110e1eb43a6cba6fdb64c895faebb47628a96550edc5238b78a",
- "sha256:3bd044ef32bd6738c3db2cb2d4bc77812e9a3132df942303bbfcd1a484023b60",
- "sha256:403fa9544ecdedcc5fdc48f5e41e092658ac48222cfe6e75fb5710cb3d14c700",
- "sha256:41eab10e6570e14dd77a346f3dbb1eab3f23a652bce07ba47c8c23116b0cee9c",
- "sha256:477db538b596767d036379165a27aa2e19edbae50bec4cea195a986ba50bbad6",
- "sha256:4b054fd8cf274b958a3d7a201f8b42a30ebf8f76d87770075e1aca6017006e97",
- "sha256:4e6d6b0e541b00d0096a260d5f6eb32f737bfcdb2e5b87a7b7be77ef669c7a6c",
- "sha256:5be097127be1659bc6cffb5d885c781e61947597e2fcd1ecf48713313e53657d",
- "sha256:5dd700e52ec28c64d43f681ccde76b6436c8f89a332d6c9e22a6b629f28daeb5",
- "sha256:657a49b1df5a82985ea6495c6c1497a17e34e41a0bd8ef95a640342a19b8e6a4",
- "sha256:6f437168752e50ad6a47d054f4a41933693b1675f65663c117067747d95f057c",
- "sha256:70dfe74b24971476a6a20d42abb964c9ac0fb1af7b89228e5845748377543bd0",
- "sha256:7161d71debb94c456cbddd8a239e89219f37f0b1a4c0620a2c1400801aeeec7d",
- "sha256:7309bf511c8b332be2b5a834efbd7ee0cd43db2c811dd916fd0f48acd43e8722",
- "sha256:7c3c9657417bf1eccb94ad64544e12efa8ea3e16612944b32e253314472a54e5",
- "sha256:8486a01696e3cdfa47b93caa8f5064c9d277bad1c39eb31947bf2b8f019e3510",
- "sha256:85b37acd054f8f778e5c9832e17fb651f321a3daafa0eb94360eeffce141b0cf",
- "sha256:8df3114dfff411aa9827d754bb8fdcdaa15e63c96d7730778fe322f4c85360d8",
- "sha256:90f0bb1adcfea326c6548a45cc35474bec56a34d80310b6e78abab313da780fc",
- "sha256:95939a7e3972ec20e2e959ee9cd0fd858b25ff3a6f5040c5c78fcab51eeab030",
- "sha256:a51a09a3be208e627ebb518a78c639d240584f5d1da8106dcafa31d22103b4df",
- "sha256:a72cb707cc0a9d06e3912fe5b6c1648d70ac512f3e180018c82fe926926be12c",
- "sha256:a820ef78f39c29469caacb0bf43ffd024b78f242393c605daa748588b3247306",
- "sha256:aab21dd5724aa5bdd0aac16f5d175e5df0715fc614910220a918d50f08321982",
- "sha256:ac9b88a72f2dcfa3facbe6af96d59e82459e5815c15aa59481cc6080937ee02e",
- "sha256:b508a826c4b99835e3d8a8d415a6e516cacad4a95ef5ed01f60f9b067f200a51",
- "sha256:bd4da25cc46e972b029f8aa9f103c5977dbe461e1916ff7edec24065071b4a08",
- "sha256:cf67443d06b88218eb8915da2d968dcf6fdc384fb245f97155617ff3b8d77e92",
- "sha256:d2db7bcdc9b3e5a782d71df0163a6587b8b2f759cc4a819859e27e6ad2f778e6",
- "sha256:d2ec8309309fc7254d21286d6b3e5c28e4019cd8e266d1a860456a69ea7c2400",
- "sha256:d2fd76d29f4e8d7c4aac42617a0439506144146032b5d7b9b0a42f37f916fdb2",
- "sha256:d34848645f3507dc85baa8c67426f0685b08583e930fa3a1ab5048c5f0ba8fc1",
- "sha256:d3999f92ab7aab2a99ac7f7730b3bee8d6bd3e52953ed0e87ab881ca4244a315",
- "sha256:d4a7065d7fc991edb93483dbb7bc37dd091a2bac9572d9b9df243e6565d30522",
- "sha256:d78db92ac27cdcd55333b7e642cd400719842e692e8836f0b249e459b26d384b",
- "sha256:d7fecb5172dc885665581437fe96bf8f03ffc0022b723964c272accbb62713b4",
- "sha256:db1b3442441577d81bdae85fc7a4bd553e3161ec745e9dd1f2f93889248363fe",
- "sha256:e3731e0dc1c200e5c2f56ca36bed6c28903f764769f534fbf9ed4178f193e8aa",
- "sha256:e3963c919f65367587cf987a71991e69385f19cec9ad8166249b83e176cdbcd8",
- "sha256:e80ade52a06fb433c9ad7d6c8cfb3dafa34f05bedce543e95a670972ba41d65d",
- "sha256:ec1a20936484f3804fba4f29f7d8ed67c70e44536b0f0191a13eff4dc61c815c",
- "sha256:ed14a62f8bf2462f19373c337527ff684deb6d0d6b973fbcaece1f561c30f405",
- "sha256:fb5f0d29bcbfba6ef9beccba55f567d089747034add5cd7e8dc58842bb745803",
- "sha256:fbbe18afb67329577ab6a907f348175d3f6044d179a9b56b02206ff9e67c5b12"
+ "sha256:04c298d381a3b6274b0a8001f0da0ec7819d052ad9c3b0863fe8c7f154061f76",
+ "sha256:0fde1c56ec118940974e726c2a27e5b54e71e16c6f81d0b4722112b91d2d9009",
+ "sha256:126aa8433773efad0871f624339c7984a9c43913952f77d5abeee7f95a0c0860",
+ "sha256:1a4fb8b4ac6772e4d656103ccaf2e43e45bd16b5da324b963d58ef360d09eb73",
+ "sha256:2e4812720582d0803e84aefa2ac48ce1e1e6e200ca3ce1ae2be6d410c1d637ae",
+ "sha256:2f01b27d0b453f07cbcff01405cdd007e71f5d6410eb01303a16ba19213e58e4",
+ "sha256:31d12fcd942dd8dbf52ca5f6b1bbe287f44e5d551a081a983ff3ea2082867863",
+ "sha256:3c927b6aa682c6d96225e1c797f4a5d0b9f777b327dea912b23471aaf5385376",
+ "sha256:3d5bb598963ac1f1f5b72dd006adb46ca6203e4fb7269a5b6e1f99e85b07ad38",
+ "sha256:401508efe02341ae681752a87e8ac9ef76df85ef1a238a7a21786a489d2c983d",
+ "sha256:4514b19abe6dbd36d6c5d75c54faca24b1ceb3999193c5b1f4b685abeabde3d0",
+ "sha256:47dfa52bed3097c705451bafd56dac26535545a987b6759fa39da1602349d7ba",
+ "sha256:4fa496d2d674c6e9cffc561639d17009d29adee84a27cf1e12d3c9be14aa8feb",
+ "sha256:55a513ec67e85abd8b8b83af8813368036f03e2d29a50fc94033504918273980",
+ "sha256:55b3187e0bed004533149882ef8c24e954321f3be81f8a9ceffe35099b82a0d0",
+ "sha256:593f96718ad906e24d6534187fdade28b611f8ed06e27ba972ba48aecec45fc6",
+ "sha256:5e21032efe673b887464667d09406bab6e16d96b09ad87e80859e3a20b6745b6",
+ "sha256:60a86b7b2b1c300779167cf595e019e61afcc0e20c4838692983a921db9006ac",
+ "sha256:619f9bf37cdb4c3dc9d4120d2a1003f5db9446f3618a323219f408f6a9df6725",
+ "sha256:660b91eca10ee1b44c47843894abe3e6cfd80e50c90dee3123befbf7ca486bd3",
+ "sha256:67710d220af405f5ce22712fa741d85e8b3ada7a457ea419b038469ba379837c",
+ "sha256:6caed86cd47ae93915d9031dc04be5283c275e1a2af2ceff33932071f3eeff4d",
+ "sha256:6d2182e648e79213b3881998b30225b3f4b1f3e681f1c1eaf4cacf19bde1040d",
+ "sha256:72758c9f785831d9d744af282d54c3e0f9db34f7eae521c33798695464993da2",
+ "sha256:74c2637d12eaacb503b0b06efdf55199a11b1d7c580bd3dd9dfe84cac97ef2f6",
+ "sha256:755020d5aeb1b10bffd93d119e7709a2a7475b6ad79c8d5226cea3f76d152ce0",
+ "sha256:7ccc4727300f223184520a6064c161a90b5d0283accd72d1455bcd85ec44dd0d",
+ "sha256:81ab21d03e3b0351847a86a0b298b297fde1e152752614138021d6d16a476ea6",
+ "sha256:8371217dff635cfc0220db2720fc3ce728cd47e72bb7572cca035332823dbdfc",
+ "sha256:876567136b0359f6581ecd892bdb4ca03a0eead0265db73206c78cff03bcdb0f",
+ "sha256:879411d04068bd489db57dcf6b82ffad3c5fb2a1fdd30817c566d8b7bedee442",
+ "sha256:898500957ae5e7f31b7271ace4e6f3625b38c0ac84e8cedde8de3a77a7fdae5e",
+ "sha256:8c9ca56345b0c5574db47560603de9d05f63cce5dfeb3a456eb60f3fec737ff2",
+ "sha256:8ec2c146e10b59c376b6bc0369929647fcd95404a503a7aa0990f21c16462248",
+ "sha256:8f7c68de4f362c1b2f426395fe4e05028c56d0782b2ec3ae18a5416eaf775576",
+ "sha256:909bdd4e19ea437eb9b45d6695d722f6f0fd9d8f493e837d70f92062b9f39faf",
+ "sha256:9d97c713433087ba5cee61a3e8edb54029753d45a4288ad61a176fa4718033ce",
+ "sha256:a65e0119ad39e855427520f7829618f78eb2824aa05e63ff19b466080cd99210",
+ "sha256:aa9087571729c968cd853d54b3f6e9d0ec61e45cd2c31e0eb8a0d4bdbbe6da2f",
+ "sha256:aef0889417eda2db000d791f9739f5cecb9ccdd45c98f82c6be531bdc67ff0f2",
+ "sha256:b253d0c53c8ee12c3e53d181fb9ef6ce2cd9c41cbca1c56a535e4fc8ec41e241",
+ "sha256:b80f6f6478f9d4ca26daee6c61584499493bf97950cfaa1a02b16bb5c2c17e70",
+ "sha256:be6329b5ba18ec5d32dc26181e0148e423347ed936dda48bf49fb243895d1566",
+ "sha256:c7560f622e3849cc8f3e999791a915addd08fafe80b47fcf3ffbda5b5151047c",
+ "sha256:d1a7a716bb04b1c3c4a707e38e2dee46ac544fff931e66d7ae944f3019fc55b8",
+ "sha256:d63b04e16df8ea21dfcedbf5a60e11cbba9d835d44cb3cbff233cfd037a916d5",
+ "sha256:d777d239036815e9b3a093fa9208ad314c040c26d7246617e70e23025b60083a",
+ "sha256:e892d3177380ec080550b56a7ffeab680af25575d291766bdd875147ba246a91",
+ "sha256:e9c90a44470f2999779057aeaf33461cbd8bb59d8f15e983150d10bb260e16e0",
+ "sha256:f097dda5d4f9b9b01b3c9fa2069f9c02929365f48f341feddf3d6b32510a2f93",
+ "sha256:f4ebfe03cbae821ef994b2e58e4df6a087470cc522aca502614e82a143365d45"
],
"index": "ia",
- "version": "==0.14.1"
+ "version": "==0.19.0"
}
},
"develop": {
"astroid": {
"hashes": [
- "sha256:2f4078c2a41bf377eea06d71c9d2ba4eb8f6b1af2135bec27bbbb7d8f12bb703",
- "sha256:bc58d83eb610252fd8de6363e39d4f1d0619c894b0ed24603b881c02e64c7386"
+ "sha256:10e0ad5f7b79c435179d0d0f0df69998c4eef4597534aae44910db060baeb907",
+ "sha256:1493fe8bd3dfd73dc35bd53c9d5b6e49ead98497c47b2307662556a5692d29d7"
],
- "version": "==2.4.2"
+ "markers": "python_full_version >= '3.7.2'",
+ "version": "==2.12.13"
+ },
+ "asttokens": {
+ "hashes": [
+ "sha256:4622110b2a6f30b77e1473affaa97e711bc2f07d3f10848420ff1898edbe94f3",
+ "sha256:6b0ac9e93fb0335014d382b8fa9b3afa7df546984258005da0b9e7095b3deb1c"
+ ],
+ "version": "==2.2.1"
},
"attrs": {
"hashes": [
- "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6",
- "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"
+ "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
+ "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"
],
- "version": "==20.3.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==22.2.0"
},
"backcall": {
"hashes": [
@@ -1130,96 +1443,146 @@
],
"version": "==0.2.0"
},
+ "black": {
+ "hashes": [
+ "sha256:101c69b23df9b44247bd88e1d7e90154336ac4992502d4197bdac35dd7ee3320",
+ "sha256:159a46a4947f73387b4d83e87ea006dbb2337eab6c879620a3ba52699b1f4351",
+ "sha256:1f58cbe16dfe8c12b7434e50ff889fa479072096d79f0a7f25e4ab8e94cd8350",
+ "sha256:229351e5a18ca30f447bf724d007f890f97e13af070bb6ad4c0a441cd7596a2f",
+ "sha256:436cc9167dd28040ad90d3b404aec22cedf24a6e4d7de221bec2730ec0c97bcf",
+ "sha256:559c7a1ba9a006226f09e4916060982fd27334ae1998e7a38b3f33a37f7a2148",
+ "sha256:7412e75863aa5c5411886804678b7d083c7c28421210180d67dfd8cf1221e1f4",
+ "sha256:77d86c9f3db9b1bf6761244bc0b3572a546f5fe37917a044e02f3166d5aafa7d",
+ "sha256:82d9fe8fee3401e02e79767016b4907820a7dc28d70d137eb397b92ef3cc5bfc",
+ "sha256:9eedd20838bd5d75b80c9f5487dbcb06836a43833a37846cf1d8c1cc01cef59d",
+ "sha256:c116eed0efb9ff870ded8b62fe9f28dd61ef6e9ddd28d83d7d264a38417dcee2",
+ "sha256:d30b212bffeb1e252b31dd269dfae69dd17e06d92b87ad26e23890f3efea366f"
+ ],
+ "index": "ia",
+ "version": "==22.12.0"
+ },
"certifi": {
"hashes": [
- "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
- "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2020.12.5"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
- "chardet": {
+ "charset-normalizer": {
"hashes": [
- "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
- "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==4.0.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
- "coverage": {
+ "click": {
"hashes": [
- "sha256:08b3ba72bd981531fd557f67beee376d6700fba183b167857038997ba30dd297",
- "sha256:2757fa64e11ec12220968f65d086b7a29b6583d16e9a544c889b22ba98555ef1",
- "sha256:3102bb2c206700a7d28181dbe04d66b30780cde1d1c02c5f3c165cf3d2489497",
- "sha256:3498b27d8236057def41de3585f317abae235dd3a11d33e01736ffedb2ef8606",
- "sha256:378ac77af41350a8c6b8801a66021b52da8a05fd77e578b7380e876c0ce4f528",
- "sha256:38f16b1317b8dd82df67ed5daa5f5e7c959e46579840d77a67a4ceb9cef0a50b",
- "sha256:3911c2ef96e5ddc748a3c8b4702c61986628bb719b8378bf1e4a6184bbd48fe4",
- "sha256:3a3c3f8863255f3c31db3889f8055989527173ef6192a283eb6f4db3c579d830",
- "sha256:3b14b1da110ea50c8bcbadc3b82c3933974dbeea1832e814aab93ca1163cd4c1",
- "sha256:535dc1e6e68fad5355f9984d5637c33badbdc987b0c0d303ee95a6c979c9516f",
- "sha256:6f61319e33222591f885c598e3e24f6a4be3533c1d70c19e0dc59e83a71ce27d",
- "sha256:723d22d324e7997a651478e9c5a3120a0ecbc9a7e94071f7e1954562a8806cf3",
- "sha256:76b2775dda7e78680d688daabcb485dc87cf5e3184a0b3e012e1d40e38527cc8",
- "sha256:782a5c7df9f91979a7a21792e09b34a658058896628217ae6362088b123c8500",
- "sha256:7e4d159021c2029b958b2363abec4a11db0ce8cd43abb0d9ce44284cb97217e7",
- "sha256:8dacc4073c359f40fcf73aede8428c35f84639baad7e1b46fce5ab7a8a7be4bb",
- "sha256:8f33d1156241c43755137288dea619105477961cfa7e47f48dbf96bc2c30720b",
- "sha256:8ffd4b204d7de77b5dd558cdff986a8274796a1e57813ed005b33fd97e29f059",
- "sha256:93a280c9eb736a0dcca19296f3c30c720cb41a71b1f9e617f341f0a8e791a69b",
- "sha256:9a4f66259bdd6964d8cf26142733c81fb562252db74ea367d9beb4f815478e72",
- "sha256:9a9d4ff06804920388aab69c5ea8a77525cf165356db70131616acd269e19b36",
- "sha256:a2070c5affdb3a5e751f24208c5c4f3d5f008fa04d28731416e023c93b275277",
- "sha256:a4857f7e2bc6921dbd487c5c88b84f5633de3e7d416c4dc0bb70256775551a6c",
- "sha256:a607ae05b6c96057ba86c811d9c43423f35e03874ffb03fbdcd45e0637e8b631",
- "sha256:a66ca3bdf21c653e47f726ca57f46ba7fc1f260ad99ba783acc3e58e3ebdb9ff",
- "sha256:ab110c48bc3d97b4d19af41865e14531f300b482da21783fdaacd159251890e8",
- "sha256:b239711e774c8eb910e9b1ac719f02f5ae4bf35fa0420f438cdc3a7e4e7dd6ec",
- "sha256:be0416074d7f253865bb67630cf7210cbc14eb05f4099cc0f82430135aaa7a3b",
- "sha256:c46643970dff9f5c976c6512fd35768c4a3819f01f61169d8cdac3f9290903b7",
- "sha256:c5ec71fd4a43b6d84ddb88c1df94572479d9a26ef3f150cef3dacefecf888105",
- "sha256:c6e5174f8ca585755988bc278c8bb5d02d9dc2e971591ef4a1baabdf2d99589b",
- "sha256:c89b558f8a9a5a6f2cfc923c304d49f0ce629c3bd85cb442ca258ec20366394c",
- "sha256:cc44e3545d908ecf3e5773266c487ad1877be718d9dc65fc7eb6e7d14960985b",
- "sha256:cc6f8246e74dd210d7e2b56c76ceaba1cc52b025cd75dbe96eb48791e0250e98",
- "sha256:cd556c79ad665faeae28020a0ab3bda6cd47d94bec48e36970719b0b86e4dcf4",
- "sha256:ce6f3a147b4b1a8b09aae48517ae91139b1b010c5f36423fa2b866a8b23df879",
- "sha256:ceb499d2b3d1d7b7ba23abe8bf26df5f06ba8c71127f188333dddcf356b4b63f",
- "sha256:cef06fb382557f66d81d804230c11ab292d94b840b3cb7bf4450778377b592f4",
- "sha256:e448f56cfeae7b1b3b5bcd99bb377cde7c4eb1970a525c770720a352bc4c8044",
- "sha256:e52d3d95df81c8f6b2a1685aabffadf2d2d9ad97203a40f8d61e51b70f191e4e",
- "sha256:ee2f1d1c223c3d2c24e3afbb2dd38be3f03b1a8d6a83ee3d9eb8c36a52bee899",
- "sha256:f2c6888eada180814b8583c3e793f3f343a692fc802546eed45f40a001b1169f",
- "sha256:f51dbba78d68a44e99d484ca8c8f604f17e957c1ca09c3ebc2c7e3bbd9ba0448",
- "sha256:f54de00baf200b4539a5a092a759f000b5f45fd226d6d25a76b0dff71177a714",
- "sha256:fa10fee7e32213f5c7b0d6428ea92e3a3fdd6d725590238a3f92c0de1c78b9d2",
- "sha256:fabeeb121735d47d8eab8671b6b031ce08514c86b7ad8f7d5490a7b6dcd6267d",
- "sha256:fac3c432851038b3e6afe086f777732bcf7f6ebbfd90951fa04ee53db6d0bcdd",
- "sha256:fda29412a66099af6d6de0baa6bd7c52674de177ec2ad2630ca264142d69c6c7",
- "sha256:ff1330e8bc996570221b450e2d539134baa9465f5cb98aff0e0f73f34172e0ae"
+ "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e",
+ "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"
],
- "version": "==5.3.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==8.1.3"
+ },
+ "coverage": {
+ "extras": [
+ "toml"
+ ],
+ "hashes": [
+ "sha256:07bcfb1d8ac94af886b54e18a88b393f6a73d5959bb31e46644a02453c36e475",
+ "sha256:09f6b5a8415b6b3e136d5fec62b552972187265cb705097bf030eb9d4ffb9b60",
+ "sha256:0a79137fc99815fff6a852c233628e735ec15903cfd16da0f229d9c4d45926ab",
+ "sha256:0b4b3a4d9915b2be879aff6299c0a6129f3d08a775d5a061f503cf79571f73e4",
+ "sha256:1285648428a6101b5f41a18991c84f1c3959cee359e51b8375c5882fc364a13f",
+ "sha256:12a5aa77783d49e05439fbe6e6b427484f8a0f9f456b46a51d8aac022cfd024d",
+ "sha256:19ec666533f0f70a0993f88b8273057b96c07b9d26457b41863ccd021a043b9a",
+ "sha256:1e414dc32ee5c3f36544ea466b6f52f28a7af788653744b8570d0bf12ff34bc0",
+ "sha256:2c44fcfb3781b41409d0f060a4ed748537557de9362a8a9282182fafb7a76ab4",
+ "sha256:397b4a923cc7566bbc7ae2dfd0ba5a039b61d19c740f1373791f2ebd11caea59",
+ "sha256:3cfc595d2af13856505631be072835c59f1acf30028d1c860b435c5fc9c15b69",
+ "sha256:3dd4ee135e08037f458425b8842d24a95a0961831a33f89685ff86b77d378f89",
+ "sha256:486ee81fa694b4b796fc5617e376326a088f7b9729c74d9defa211813f3861e4",
+ "sha256:4f943a3b2bc520102dd3e0bb465e1286e12c9a54f58accd71b9e65324d9c7c01",
+ "sha256:63d56165a7c76265468d7e0c5548215a5ba515fc2cba5232d17df97bffa10f6c",
+ "sha256:66b18c3cf8bbab0cce0d7b9e4262dc830e93588986865a8c78ab2ae324b3ed56",
+ "sha256:691571f31ace1837838b7e421d3a09a8c00b4aac32efacb4fc9bd0a5c647d25a",
+ "sha256:6c5ad996c6fa4d8ed669cfa1e8551348729d008a2caf81489ab9ea67cfbc7498",
+ "sha256:6d55d840e1b8c0002fce66443e124e8581f30f9ead2e54fbf6709fb593181f2c",
+ "sha256:72d1507f152abacea81f65fee38e4ef3ac3c02ff8bc16f21d935fd3a8a4ad910",
+ "sha256:74f70cd92669394eaf8d7756d1b195c8032cf7bbbdfce3bc489d4e15b3b8cf73",
+ "sha256:830525361249dc4cd013652b0efad645a385707a5ae49350c894b67d23fbb07c",
+ "sha256:854f22fa361d1ff914c7efa347398374cc7d567bdafa48ac3aa22334650dfba2",
+ "sha256:89caf4425fe88889e2973a8e9a3f6f5f9bbe5dd411d7d521e86428c08a873a4a",
+ "sha256:9158f8fb06747ac17bd237930c4372336edc85b6e13bdc778e60f9d685c3ca37",
+ "sha256:92651580bd46519067e36493acb394ea0607b55b45bd81dd4e26379ed1871f55",
+ "sha256:978258fec36c154b5e250d356c59af7d4c3ba02bef4b99cda90b6029441d797d",
+ "sha256:9823e4789ab70f3ec88724bba1a203f2856331986cd893dedbe3e23a6cfc1e4e",
+ "sha256:9b373c9345c584bb4b5f5b8840df7f4ab48c4cbb7934b58d52c57020d911b856",
+ "sha256:a4a574a19eeb67575a5328a5760bbbb737faa685616586a9f9da4281f940109c",
+ "sha256:aec2d1515d9d39ff270059fd3afbb3b44e6ec5758af73caf18991807138c7118",
+ "sha256:b3695c4f4750bca943b3e1f74ad4be8d29e4aeab927d50772c41359107bd5d5c",
+ "sha256:b3763e7fcade2ff6c8e62340af9277f54336920489ceb6a8cd6cc96da52fcc62",
+ "sha256:b66bb21a23680dee0be66557dc6b02a3152ddb55edf9f6723fa4a93368f7158d",
+ "sha256:b6f22bb64cc39bcb883e5910f99a27b200fdc14cdd79df8696fa96b0005c9444",
+ "sha256:b77015d1cb8fe941be1222a5a8b4e3fbca88180cfa7e2d4a4e58aeabadef0ab7",
+ "sha256:b9ea158775c7c2d3e54530a92da79496fb3fb577c876eec761c23e028f1e216c",
+ "sha256:c20cfebcc149a4c212f6491a5f9ff56f41829cd4f607b5be71bb2d530ef243b1",
+ "sha256:cfded268092a84605f1cc19e5c737f9ce630a8900a3589e9289622db161967e9",
+ "sha256:d1991f1dd95eba69d2cd7708ff6c2bbd2426160ffc73c2b81f617a053ebcb1a8",
+ "sha256:d3022c3007d3267a880b5adcf18c2a9bf1fc64469b394a804886b401959b8742",
+ "sha256:d6814854c02cbcd9c873c0f3286a02e3ac1250625cca822ca6bc1018c5b19f1c",
+ "sha256:d87717959d4d0ee9db08a0f1d80d21eb585aafe30f9b0a54ecf779a69cb015f6",
+ "sha256:e00c14720b8b3b6c23b487e70bd406abafc976ddc50490f645166f111c419c39",
+ "sha256:e60bef2e2416f15fdc05772bf87db06c6a6f9870d1db08fdd019fbec98ae24a9",
+ "sha256:e78e9dcbf4f3853d3ae18a8f9272111242531535ec9e1009fa8ec4a2b74557dc",
+ "sha256:f66460f17c9319ea4f91c165d46840314f0a7c004720b20be58594d162a441d8",
+ "sha256:fa6a5a224b7f4cfb226f4fc55a57e8537fcc096f42219128c2c74c0e7d0953e1",
+ "sha256:fb992c47cb1e5bd6a01e97182400bcc2ba2077080a17fcd7be23aaa6e572e390",
+ "sha256:fd1b9c5adc066db699ccf7fa839189a649afcdd9e02cb5dc9d24e67e7922737d",
+ "sha256:fd556ff16a57a070ce4f31c635953cc44e25244f91a0378c6e9bdfd40fdb249f"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==7.0.1"
},
"decorator": {
"hashes": [
- "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
- "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
+ },
+ "dill": {
+ "hashes": [
+ "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0",
+ "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"
],
- "version": "==4.4.2"
+ "markers": "python_version < '3.11'",
+ "version": "==0.3.6"
+ },
+ "executing": {
+ "hashes": [
+ "sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc",
+ "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107"
+ ],
+ "version": "==1.2.0"
},
"flake8": {
"hashes": [
- "sha256:749dbbd6bfd0cf1318af27bf97a14e28e5ff548ef8e5b1566ccfb25a11e7c839",
- "sha256:aadae8761ec651813c24be05c6f7b4680857ef6afaae4651a4eccaef97ce6c3b"
+ "sha256:3833794e27ff64ea4e9cf5d410082a8b97ff1a06c16aa3d2027339cd0f1195c7",
+ "sha256:c61007e76655af75e6785a931f452915b371dc48f56efd765247c8fe68f2b181"
],
"index": "ia",
- "version": "==3.8.4"
+ "version": "==6.0.0"
},
"flake8-annotations": {
"hashes": [
- "sha256:0bcebb0792f1f96d617ded674dca7bf64181870bfe5dace353a1483551f8e5f1",
- "sha256:bebd11a850f6987a943ce8cdff4159767e0f5f89b3c88aca64680c2175ee02df"
+ "sha256:11f09efb99ae63c8f9d6b492b75fe147fbc323179fddfe00b2e56eefeca42f57",
+ "sha256:a4385158a7a9fc8af1d8820a2f4c8d03387997006a83f5f8bfe5bc6085bdf88a"
],
"index": "ia",
- "version": "==2.4.1"
+ "version": "==2.9.1"
},
"idna": {
"hashes": [
@@ -1237,85 +1600,104 @@
},
"ipython": {
"hashes": [
- "sha256:c987e8178ced651532b3b1ff9965925bfd445c279239697052561a9ab806d28f",
- "sha256:cbb2ef3d5961d44e6a963b9817d4ea4e1fa2eb589c371a470fed14d8d40cbd6a"
+ "sha256:352042ddcb019f7c04e48171b4dd78e4c4bb67bf97030d170e154aac42b656d9",
+ "sha256:882899fe78d5417a0aa07f995db298fa28b58faeba2112d2e3a4c95fe14bb738"
],
"index": "ia",
- "version": "==7.19.0"
- },
- "ipython-genutils": {
- "hashes": [
- "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
- "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
- ],
- "version": "==0.2.0"
+ "version": "==8.7.0"
},
"isort": {
"hashes": [
- "sha256:dcab1d98b469a12a1a624ead220584391648790275560e1a43e54c5dceae65e7",
- "sha256:dcaeec1b5f0eca77faea2a35ab790b4f3680ff75590bfcb7145986905aab2f58"
+ "sha256:6db30c5ded9815d813932c04c2f85a360bcdd35fed496f4d8f35495ef0a261b6",
+ "sha256:c033fd0edb91000a7f09527fe5c75321878f98322a77ddcc81adbd83724afb7b"
],
- "version": "==5.6.4"
+ "index": "ia",
+ "version": "==5.11.4"
},
"jedi": {
"hashes": [
- "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93",
- "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"
+ "sha256:203c1fd9d969ab8f2119ec0a3342e0b49910045abe6af0a3ae83a5764d54639e",
+ "sha256:bae794c30d07f6d910d32a7048af09b5a39ed740918da923c6b780790ebac612"
],
- "version": "==0.18.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.18.2"
},
"lazy-object-proxy": {
"hashes": [
- "sha256:0c4b206227a8097f05c4dbdd323c50edf81f15db3b8dc064d08c62d37e1a504d",
- "sha256:194d092e6f246b906e8f70884e620e459fc54db3259e60cf69a4d66c3fda3449",
- "sha256:1be7e4c9f96948003609aa6c974ae59830a6baecc5376c25c92d7d697e684c08",
- "sha256:4677f594e474c91da97f489fea5b7daa17b5517190899cf213697e48d3902f5a",
- "sha256:48dab84ebd4831077b150572aec802f303117c8cc5c871e182447281ebf3ac50",
- "sha256:5541cada25cd173702dbd99f8e22434105456314462326f06dba3e180f203dfd",
- "sha256:59f79fef100b09564bc2df42ea2d8d21a64fdcda64979c0fa3db7bdaabaf6239",
- "sha256:8d859b89baf8ef7f8bc6b00aa20316483d67f0b1cbf422f5b4dc56701c8f2ffb",
- "sha256:9254f4358b9b541e3441b007a0ea0764b9d056afdeafc1a5569eee1cc6c1b9ea",
- "sha256:9651375199045a358eb6741df3e02a651e0330be090b3bc79f6d0de31a80ec3e",
- "sha256:97bb5884f6f1cdce0099f86b907aa41c970c3c672ac8b9c8352789e103cf3156",
- "sha256:9b15f3f4c0f35727d3a0fba4b770b3c4ebbb1fa907dbcc046a1d2799f3edd142",
- "sha256:a2238e9d1bb71a56cd710611a1614d1194dc10a175c1e08d75e1a7bcc250d442",
- "sha256:a6ae12d08c0bf9909ce12385803a543bfe99b95fe01e752536a60af2b7797c62",
- "sha256:ca0a928a3ddbc5725be2dd1cf895ec0a254798915fb3a36af0964a0a4149e3db",
- "sha256:cb2c7c57005a6804ab66f106ceb8482da55f5314b7fcb06551db1edae4ad1531",
- "sha256:d74bb8693bf9cf75ac3b47a54d716bbb1a92648d5f781fc799347cfc95952383",
- "sha256:d945239a5639b3ff35b70a88c5f2f491913eb94871780ebfabb2568bd58afc5a",
- "sha256:eba7011090323c1dadf18b3b689845fd96a61ba0a1dfbd7f24b921398affc357",
- "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4",
- "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0"
- ],
- "version": "==1.4.3"
+ "sha256:0c1c7c0433154bb7c54185714c6929acc0ba04ee1b167314a779b9025517eada",
+ "sha256:14010b49a2f56ec4943b6cf925f597b534ee2fe1f0738c84b3bce0c1a11ff10d",
+ "sha256:4e2d9f764f1befd8bdc97673261b8bb888764dfdbd7a4d8f55e4fbcabb8c3fb7",
+ "sha256:4fd031589121ad46e293629b39604031d354043bb5cdf83da4e93c2d7f3389fe",
+ "sha256:5b51d6f3bfeb289dfd4e95de2ecd464cd51982fe6f00e2be1d0bf94864d58acd",
+ "sha256:6850e4aeca6d0df35bb06e05c8b934ff7c533734eb51d0ceb2d63696f1e6030c",
+ "sha256:6f593f26c470a379cf7f5bc6db6b5f1722353e7bf937b8d0d0b3fba911998858",
+ "sha256:71d9ae8a82203511a6f60ca5a1b9f8ad201cac0fc75038b2dc5fa519589c9288",
+ "sha256:7e1561626c49cb394268edd00501b289053a652ed762c58e1081224c8d881cec",
+ "sha256:8f6ce2118a90efa7f62dd38c7dbfffd42f468b180287b748626293bf12ed468f",
+ "sha256:ae032743794fba4d171b5b67310d69176287b5bf82a21f588282406a79498891",
+ "sha256:afcaa24e48bb23b3be31e329deb3f1858f1f1df86aea3d70cb5c8578bfe5261c",
+ "sha256:b70d6e7a332eb0217e7872a73926ad4fdc14f846e85ad6749ad111084e76df25",
+ "sha256:c219a00245af0f6fa4e95901ed28044544f50152840c5b6a3e7b2568db34d156",
+ "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8",
+ "sha256:d176f392dbbdaacccf15919c77f526edf11a34aece58b55ab58539807b85436f",
+ "sha256:e20bfa6db17a39c706d24f82df8352488d2943a3b7ce7d4c22579cb89ca8896e",
+ "sha256:eac3a9a5ef13b332c059772fd40b4b1c3d45a3a2b05e33a361dee48e54a4dad0",
+ "sha256:eb329f8d8145379bf5dbe722182410fe8863d186e51bf034d2075eb8d85ee25b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.8.0"
+ },
+ "matplotlib-inline": {
+ "hashes": [
+ "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311",
+ "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==0.1.6"
},
"mccabe": {
"hashes": [
- "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
- "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325",
+ "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"
],
- "version": "==0.6.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.7.0"
},
"mypy": {
"hashes": [
- "sha256:0a0d102247c16ce93c97066443d11e2d36e6cc2a32d8ccc1f705268970479324",
- "sha256:0d34d6b122597d48a36d6c59e35341f410d4abfa771d96d04ae2c468dd201abc",
- "sha256:2170492030f6faa537647d29945786d297e4862765f0b4ac5930ff62e300d802",
- "sha256:2842d4fbd1b12ab422346376aad03ff5d0805b706102e475e962370f874a5122",
- "sha256:2b21ba45ad9ef2e2eb88ce4aeadd0112d0f5026418324176fd494a6824b74975",
- "sha256:72060bf64f290fb629bd4a67c707a66fd88ca26e413a91384b18db3876e57ed7",
- "sha256:af4e9ff1834e565f1baa74ccf7ae2564ae38c8df2a85b057af1dbbc958eb6666",
- "sha256:bd03b3cf666bff8d710d633d1c56ab7facbdc204d567715cb3b9f85c6e94f669",
- "sha256:c614194e01c85bb2e551c421397e49afb2872c88b5830e3554f0519f9fb1c178",
- "sha256:cf4e7bf7f1214826cf7333627cb2547c0db7e3078723227820d0a2490f117a01",
- "sha256:da56dedcd7cd502ccd3c5dddc656cb36113dd793ad466e894574125945653cea",
- "sha256:e86bdace26c5fe9cf8cb735e7cedfe7850ad92b327ac5d797c656717d2ca66de",
- "sha256:e97e9c13d67fbe524be17e4d8025d51a7dca38f90de2e462243ab8ed8a9178d1",
- "sha256:eea260feb1830a627fb526d22fbb426b750d9f5a47b624e8d5e7e004359b219c"
+ "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d",
+ "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6",
+ "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf",
+ "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f",
+ "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813",
+ "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33",
+ "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad",
+ "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05",
+ "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297",
+ "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06",
+ "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd",
+ "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243",
+ "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305",
+ "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476",
+ "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711",
+ "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70",
+ "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5",
+ "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461",
+ "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab",
+ "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c",
+ "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d",
+ "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135",
+ "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93",
+ "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648",
+ "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a",
+ "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb",
+ "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3",
+ "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372",
+ "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb",
+ "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"
],
"index": "ia",
- "version": "==0.790"
+ "version": "==0.991"
},
"mypy-extensions": {
"hashes": [
@@ -1326,17 +1708,27 @@
},
"packaging": {
"hashes": [
- "sha256:24e0da08660a87484d1602c30bb4902d74816b6985b93de36926f5bc95741858",
- "sha256:78598185a7008a470d64526a8059de9aaa449238f280fc9eb6b13ba6c4109093"
+ "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
+ "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
],
- "version": "==20.8"
+ "markers": "python_version >= '3.7'",
+ "version": "==22.0"
},
"parso": {
"hashes": [
- "sha256:15b00182f472319383252c18d5913b69269590616c947747bc50bf4ac768f410",
- "sha256:8519430ad07087d4c997fda3a7918f7cfa27cb58972a8c89c2a0295a1c940e9e"
+ "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0",
+ "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"
],
- "version": "==0.8.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
+ },
+ "pathspec": {
+ "hashes": [
+ "sha256:3c95343af8b756205e2aba76e843ba9520a24dd84f68c22b9f93251507509dd6",
+ "sha256:56200de4077d9d0791465aa9095a01d421861e405b5096955051deefd697d6f6"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.10.3"
},
"pexpect": {
"hashes": [
@@ -1353,19 +1745,29 @@
],
"version": "==0.7.5"
},
+ "platformdirs": {
+ "hashes": [
+ "sha256:1a89a12377800c81983db6be069ec068eee989748799b946cce2a6e80dcc54ca",
+ "sha256:b46ffafa316e6b83b47489d240ce17173f123a9b9c83282141c3daf26ad9ac2e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.6.0"
+ },
"pluggy": {
"hashes": [
- "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
- "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
+ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
+ "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
- "version": "==0.13.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.0.0"
},
"prompt-toolkit": {
"hashes": [
- "sha256:25c95d2ac813909f813c93fde734b6e44406d1477a9faef7c915ff37d39c0a8c",
- "sha256:7debb9a521e0b1ee7d2fe96ee4bd60ef03c6492784de0547337ca4433e46aa63"
+ "sha256:3e163f254bef5a03b146397d7c1963bd3e2812f0964bb9a24e6ec761fd28db63",
+ "sha256:aa64ad242a462c5ff0363a7b9cfe696c20d55d9fc60c11fd8e632d064804d305"
],
- "version": "==3.0.8"
+ "markers": "python_full_version >= '3.6.2'",
+ "version": "==3.0.36"
},
"ptyprocess": {
"hashes": [
@@ -1374,175 +1776,224 @@
],
"version": "==0.7.0"
},
+ "pure-eval": {
+ "hashes": [
+ "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350",
+ "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"
+ ],
+ "version": "==0.2.2"
+ },
"py": {
"hashes": [
- "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
- "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
+ "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
+ "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
- "version": "==1.10.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.11.0"
},
"pycodestyle": {
"hashes": [
- "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367",
- "sha256:c58a7d2815e0e8d7972bf1803331fb0152f867bd89adf8a01dfd55085434192e"
+ "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053",
+ "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610"
],
- "version": "==2.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.10.0"
},
"pyflakes": {
"hashes": [
- "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92",
- "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"
+ "sha256:ec55bf7fe21fff7f1ad2f7da62363d749e2a470500eab1b555334b67aa1ef8cf",
+ "sha256:ec8b276a6b60bd80defed25add7e439881c19e64850afd9b346283d4165fd0fd"
],
- "version": "==2.2.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==3.0.1"
},
"pygments": {
"hashes": [
- "sha256:ccf3acacf3782cbed4a989426012f1c535c9a90d3a7fc3f16d231b9372d2b716",
- "sha256:f275b6c0909e5dafd2d6269a656aa90fa58ebf4a74f8fcf9053195d226b24a08"
+ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
+ "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
- "version": "==2.7.3"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.13.0"
},
"pylint": {
"hashes": [
- "sha256:bb4a908c9dadbc3aac18860550e870f58e1a02c9f2c204fdf5693d73be061210",
- "sha256:bfe68f020f8a0fece830a22dd4d5dddb4ecc6137db04face4c3420a46a52239f"
+ "sha256:18783cca3cfee5b83c6c5d10b3cdb66c6594520ffae61890858fe8d932e1c6b4",
+ "sha256:349c8cd36aede4d50a0754a8c0218b43323d13d5d88f4b2952ddfe3e169681eb"
],
"index": "ia",
- "version": "==2.6.0"
- },
- "pyparsing": {
- "hashes": [
- "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
- "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
- ],
- "version": "==2.4.7"
+ "version": "==2.15.9"
},
"pytest": {
"hashes": [
- "sha256:1969f797a1a0dbd8ccf0fecc80262312729afea9c17f1d70ebf85c5e76c6f7c8",
- "sha256:66e419b1899bc27346cb2c993e12c5e5e8daba9073c1fbce33b9807abc95c306"
+ "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
+ "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
],
"index": "ia",
- "version": "==6.2.1"
+ "version": "==6.2.5"
},
"pytest-cov": {
"hashes": [
- "sha256:45ec2d5182f89a81fc3eb29e3d1ed3113b9e9a873bcddb2a71faaab066110191",
- "sha256:47bd0ce14056fdd79f93e1713f88fad7bdcc583dcd7783da86ef2f085a0bb88e"
+ "sha256:2feb1b751d66a8bd934e5edfa2e961d11309dc37b73b0eabe73b5945fee20f6b",
+ "sha256:996b79efde6433cdbd0088872dbc5fb3ed7fe1578b68cdbba634f14bb8dd0470"
],
"index": "ia",
- "version": "==2.10.1"
+ "version": "==4.0.0"
},
"pytest-mock": {
"hashes": [
- "sha256:c0fc979afac4aaba545cbd01e9c20736eb3fefb0a066558764b07d3de8f04ed3",
- "sha256:c3981f5edee6c4d1942250a60d9b39d38d5585398de1bfce057f925bdda720f4"
+ "sha256:f4c973eeae0282963eb293eb173ce91b091a79c1334455acfac9ddee8a1c784b",
+ "sha256:fbbdb085ef7c252a326fd8cdcac0aa3b1333d8811f131bdcc701002e1be7ed4f"
],
"index": "ia",
- "version": "==3.4.0"
+ "version": "==3.10.0"
},
"pytest-pylint": {
"hashes": [
- "sha256:790c7a8019fab08e59bd3812db1657a01995a975af8b1c6ce95b9aa39d61da27",
- "sha256:b63aaf8b80ff33c8ceaa7f68323ed04102c7790093ccf6bdb261a4c2dc6fd564"
+ "sha256:b51d3f93bed9c192e2b046f16520981bee5abe7bd61b070306e7ee685219fdd3",
+ "sha256:d88e83c1023c641548a9ec3567707ceee7616632a986af133426d4a74d066932"
],
"index": "ia",
- "version": "==0.18.0"
+ "version": "==0.19.0"
},
"pytest-pythonpath": {
"hashes": [
- "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ "sha256:64e195b23a8f8c0c631fb16882d9ad6fa4137ed1f2961ddd15d52065cd435db6",
+ "sha256:e73e11dab2f0b83e73229e261242b251f0a369d7f527dbfec068822fd26a6ce5"
],
"index": "ia",
- "version": "==0.7.3"
+ "version": "==0.7.4"
},
"requests": {
"hashes": [
- "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
- "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.25.1"
+ "version": "==2.28.1"
},
"responses": {
"hashes": [
- "sha256:2e5764325c6b624e42b428688f2111fea166af46623cb0127c05f6afb14d3457",
- "sha256:ef265bd3200bdef5ec17912fc64a23570ba23597fd54ca75c18650fa1699213d"
+ "sha256:396acb2a13d25297789a5866b4881cf4e46ffd49cc26c43ab1117f40b973102e",
+ "sha256:dcf294d204d14c436fddcc74caefdbc5764795a40ff4e6a7740ed8ddbf3294be"
],
"index": "ia",
- "version": "==0.12.1"
+ "version": "==0.22.0"
},
"six": {
"hashes": [
- "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
- "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
+ },
+ "stack-data": {
+ "hashes": [
+ "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815",
+ "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8"
],
- "version": "==1.15.0"
+ "version": "==0.6.2"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
"version": "==0.10.2"
},
+ "tomli": {
+ "hashes": [
+ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
+ "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
+ ],
+ "version": "==2.0.1"
+ },
+ "tomlkit": {
+ "hashes": [
+ "sha256:07de26b0d8cfc18f871aec595fda24d95b08fef89d147caa861939f37230bf4b",
+ "sha256:71b952e5721688937fb02cf9d354dbcf0785066149d2855e44531ebdd2b65d73"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.11.6"
+ },
"traitlets": {
"hashes": [
- "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396",
- "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426"
- ],
- "version": "==5.0.5"
- },
- "typed-ast": {
- "hashes": [
- "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
- "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
- "sha256:0d8110d78a5736e16e26213114a38ca35cb15b6515d535413b090bd50951556d",
- "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
- "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
- "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
- "sha256:3742b32cf1c6ef124d57f95be609c473d7ec4c14d0090e5a5e05a15269fb4d0c",
- "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
- "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
- "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
- "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
- "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
- "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
- "sha256:7e4c9d7658aaa1fc80018593abdf8598bf91325af6af5cce4ce7c73bc45ea53d",
- "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
- "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
- "sha256:92c325624e304ebf0e025d1224b77dd4e6393f18aab8d829b5b7e04afe9b7a2c",
- "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
- "sha256:b52ccf7cfe4ce2a1064b18594381bccf4179c2ecf7f513134ec2f993dd4ab395",
- "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
- "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
- "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
- "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
- "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
- "sha256:d648b8e3bf2fe648745c8ffcee3db3ff903d0817a01a12dd6a6ea7a8f4889072",
- "sha256:f208eb7aff048f6bea9586e61af041ddf7f9ade7caed625742af423f6bae3298",
- "sha256:fac11badff8313e23717f3dada86a15389d0708275bddf766cca67a84ead3e91",
- "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
- "sha256:fcf135e17cc74dbfbc05894ebca928ffeb23d9790b3167a674921db19082401f",
- "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
- ],
- "version": "==1.4.1"
+ "sha256:6cc57d6dc28c85d5365961726ffd19b538739347749e13ebe34e03323a0e8f84",
+ "sha256:c864831efa0ba6576d09b44884b34e41defc18c0d7e720b4a2d6698c842cab3e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==5.8.0"
+ },
+ "types-beautifulsoup4": {
+ "hashes": [
+ "sha256:c1f803367a2b07ad4fdac40ddbea557010dc4ddd1ee92d801f317eb02e2e3c72",
+ "sha256:d46be8f409ddccb6daaa9d118484185e70bcf552085c39c6d05b157cd1462e04"
+ ],
+ "index": "ia",
+ "version": "==4.11.6.1"
+ },
+ "types-dateparser": {
+ "hashes": [
+ "sha256:5b0c8845167981f68f090894aa371bddbd0371341b90c3f868ac9524cd0a6b69",
+ "sha256:65232f1b3a952476fb98b31ae0a4019efd32635981040149b97b161d5ce2b4da"
+ ],
+ "index": "ia",
+ "version": "==1.1.4.4"
+ },
+ "types-pillow": {
+ "hashes": [
+ "sha256:98b8484ff343676f6f7051682a6cfd26896e993e86b3ce9badfa0ec8750f5405",
+ "sha256:c18d466dc18550d96b8b4a279ff94f0cbad696825b5ad55466604f1daf5709de"
+ ],
+ "index": "ia",
+ "version": "==9.3.0.4"
+ },
+ "types-psycopg2": {
+ "hashes": [
+ "sha256:084558d6bc4b2cfa249b06be0fdd9a14a69d307bae5bb5809a2f14cfbaa7a23f",
+ "sha256:bff045579642ce00b4a3c8f2e401b7f96dfaa34939f10be64b0dd3b53feca57d"
+ ],
+ "index": "ia",
+ "version": "==2.9.21.2"
+ },
+ "types-requests": {
+ "hashes": [
+ "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3",
+ "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"
+ ],
+ "index": "ia",
+ "version": "==2.28.11.7"
+ },
+ "types-toml": {
+ "hashes": [
+ "sha256:171bdb3163d79a520560f24ba916a9fc9bff81659c5448a9fea89240923722be",
+ "sha256:b7b5c4977f96ab7b5ac06d8a6590d17c0bf252a96efc03b109c2711fb3e0eafd"
+ ],
+ "version": "==0.10.8.1"
+ },
+ "types-urllib3": {
+ "hashes": [
+ "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49",
+ "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"
+ ],
+ "version": "==1.26.25.4"
},
"typing-extensions": {
"hashes": [
- "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
- "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
- "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
],
- "version": "==3.7.4.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"urllib3": {
"hashes": [
- "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
- "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.26.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"wcwidth": {
"hashes": [
@@ -1553,9 +2004,73 @@
},
"wrapt": {
"hashes": [
- "sha256:b62ffa81fb85f4332a4f609cab4ac40709470da05643a082ec1eb88e6d9b97d7"
- ],
- "version": "==1.12.1"
+ "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3",
+ "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b",
+ "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4",
+ "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2",
+ "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656",
+ "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3",
+ "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff",
+ "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310",
+ "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a",
+ "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57",
+ "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069",
+ "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383",
+ "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe",
+ "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87",
+ "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d",
+ "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b",
+ "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907",
+ "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f",
+ "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0",
+ "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28",
+ "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1",
+ "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853",
+ "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc",
+ "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3",
+ "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3",
+ "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164",
+ "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1",
+ "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c",
+ "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1",
+ "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7",
+ "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1",
+ "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320",
+ "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed",
+ "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1",
+ "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248",
+ "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c",
+ "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456",
+ "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77",
+ "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef",
+ "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1",
+ "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7",
+ "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86",
+ "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4",
+ "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d",
+ "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d",
+ "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8",
+ "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5",
+ "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471",
+ "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00",
+ "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68",
+ "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3",
+ "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d",
+ "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735",
+ "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d",
+ "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569",
+ "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7",
+ "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59",
+ "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5",
+ "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb",
+ "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b",
+ "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f",
+ "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462",
+ "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015",
+ "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"
+ ],
+ "markers": "python_version < '3.11'",
+ "version": "==1.14.1"
}
}
}
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..4395f19
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,46 @@
+
+This directory contains `sandcrawler` python code for ingest pipelines, batch
+processing, PDF extraction, etc.
+
+
+## Development Quickstart
+
+As of December 2022, working with this code requires:
+
+- Python 3.8 (specifically, due to version specification in `pipenv`)
+- `pipenv` for python dependency management
+- generic and python-specific build tools (`pkg-config`, `python-dev`, etc)
+- poppler (PDF processing library)
+- libmagic
+- libsodium
+- access to IA internal packages (`devpi.us.archive.org`), specifically for
+ globalwayback and related packages
+
+In production and CI we use Ubuntu Focal (20.04). The CI script for this
+repository (`../.gitlab-ci.yml`) is the best place to look for a complete list
+of dependencies for both development and deployment. Note that our CI system
+runs from our cluster, which resolves the devpi access issue. For developer
+laptops, you may need `sshuttle` or something similar set up to do initial
+package pulls.
+
+It is recommended to set the env variable `PIPENV_VENV_IN_PROJECT=true` when
+working with pipenv. You can include this in a `.env` file.
+
+There is a Makefile which helps with the basics. Eg:
+
+ # install deps using pipenv
+ make deps
+
+ # run python tests
+ make test
+
+ # run code formatting and lint checks
+ make fmt lint
+
+Sometimes when developing it is helpful to enter a shell with pipenv, eg:
+
+ pipenv shell
+
+Often when developing it is helpful (or necessary) to set environment
+variables. `pipenv shell` will read from `.env`, so you can copy `example.env`
+to `.env`, edit it, and those values will be used in tests, `pipenv shell`, etc.
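Putting the quickstart steps above together, a first-time setup might look
roughly like the following (a sketch only: it assumes `pipenv` is already
installed and the IA devpi index is reachable, e.g. via `sshuttle`):

    # keep the virtualenv inside the project, and seed local configuration
    cp example.env .env
    echo 'PIPENV_VENV_IN_PROJECT=true' >> .env

    # install dependencies, then run tests and formatting/lint checks
    make deps
    make test
    make fmt lint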
diff --git a/python/TODO b/python/TODO
deleted file mode 100644
index 58a463f..0000000
--- a/python/TODO
+++ /dev/null
@@ -1,7 +0,0 @@
-
-ingest crawler:
-- SPNv2 only
- - remove most SPNv1/v2 path selection
-- landing page + fulltext hops only (short recursion depth)
-- use wayback client library instead of requests to fetch content
-- https://pypi.org/project/ratelimit/
diff --git a/python/example.env b/python/example.env
index 5064c96..85af66c 100644
--- a/python/example.env
+++ b/python/example.env
@@ -5,3 +5,4 @@ IA_SECRET_KEY="dummy"
CDX_AUTH_TOKEN="dummy"
PETABOX_WEBDATA_SECRET="dummy"
SENTRY_DSN=""
+SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/"
diff --git a/python/grobid2json.py b/python/grobid2json.py
deleted file mode 100755
index a22d47d..0000000
--- a/python/grobid2json.py
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-NB: adapted to work as a library for PDF extraction. Will probably be
-re-written eventually to be correct, complete, and robust; this is just a
-first iteration.
-
-This script tries to extract everything from a GROBID TEI XML fulltext dump:
-
-- header metadata
-- affiliations
-- references (with context)
-- abstract
-- fulltext
-- tables, figures, equations
-
-A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
-
-- abstract
-- fulltext
-- tables, figures, equations
-
-Prints JSON to stdout, errors to stderr
-"""
-
-import io
-import json
-import argparse
-import xml.etree.ElementTree as ET
-from typing import List, Any, Dict, AnyStr, Optional
-
-xml_ns = "http://www.w3.org/XML/1998/namespace"
-ns = "http://www.tei-c.org/ns/1.0"
-
-
-def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
- if not elem:
- return []
- names = []
- for author in elem.findall(".//{%s}author" % ns):
- pn = author.find("./{%s}persName" % ns)
- if not pn:
- continue
- given_name = pn.findtext("./{%s}forename" % ns) or None
- surname = pn.findtext("./{%s}surname" % ns) or None
- full_name = " ".join(pn.itertext())
- obj: Dict[str, Any] = dict(name=full_name)
- if given_name:
- obj["given_name"] = given_name
- if surname:
- obj["surname"] = surname
- ae = author.find("./{%s}affiliation" % ns)
- if ae:
- affiliation: Dict[str, Any] = dict()
- for on in ae.findall("./{%s}orgName" % ns):
- on_type = on.get("type")
- if on_type:
- affiliation[on_type] = on.text
- addr_e = ae.find("./{%s}address" % ns)
- if addr_e:
- address = dict()
- for t in addr_e:
- address[t.tag.split("}")[-1]] = t.text
- if address:
- affiliation["address"] = address
- # affiliation['address'] = {
- # 'post_code': addr.findtext('./{%s}postCode' % ns) or None,
- # 'settlement': addr.findtext('./{%s}settlement' % ns) or None,
- # 'country': addr.findtext('./{%s}country' % ns) or None,
- # }
- obj["affiliation"] = affiliation
- names.append(obj)
- return names
-
-
-def journal_info(elem: ET.Element) -> Dict[str, Any]:
- journal = dict()
- journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
- journal["publisher"] = elem.findtext(
- ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
- )
- if journal["publisher"] == "":
- journal["publisher"] = None
- journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
- journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
- journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- keys = list(journal.keys())
-
- # remove empty/null keys
- for k in keys:
- if not journal[k]:
- journal.pop(k)
- return journal
-
-
-def biblio_info(elem: ET.Element) -> Dict[str, Any]:
- ref: Dict[str, Any] = dict()
- ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
- # Title stuff is messy in references...
- ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
- if other_title:
- if ref["title"]:
- ref["journal"] = other_title
- else:
- ref["journal"] = None
- ref["title"] = other_title
- ref["authors"] = all_authors(elem)
- ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
- if ref["publisher"] == "":
- ref["publisher"] = None
- date = elem.find('.//{%s}date[@type="published"]' % ns)
- ref["date"] = (date is not None) and date.attrib.get("when")
- ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- el = elem.find(".//{%s}ptr[@target]" % ns)
- if el is not None:
- ref["url"] = el.attrib["target"]
- # Hand correction
- if ref["url"].endswith(".Lastaccessed"):
- ref["url"] = ref["url"].replace(".Lastaccessed", "")
- else:
- ref["url"] = None
- return ref
-
-
-def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
-
- if isinstance(content, str):
- tree = ET.parse(io.StringIO(content))
- elif isinstance(content, bytes):
- tree = ET.parse(io.BytesIO(content))
-
- info: Dict[str, Any] = dict()
-
- # print(content)
- # print(content.getvalue())
- tei = tree.getroot()
-
- header = tei.find(".//{%s}teiHeader" % ns)
- if header is None:
- raise ValueError("XML does not look like TEI format")
- application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
- info["grobid_version"] = application_tag.attrib["version"].strip()
- info["grobid_timestamp"] = application_tag.attrib["when"].strip()
- info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- info["authors"] = all_authors(
- header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
- )
- info["journal"] = journal_info(header)
- date = header.find('.//{%s}date[@type="published"]' % ns)
- info["date"] = (date is not None) and date.attrib.get("when")
- info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
- info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
- if info["doi"]:
- info["doi"] = info["doi"].lower()
-
- refs = []
- for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
- ref = biblio_info(bs)
- ref["index"] = i
- refs.append(ref)
- info["citations"] = refs
-
- text = tei.find(".//{%s}text" % (ns))
- # print(text.attrib)
- if text and text.attrib.get("{%s}lang" % xml_ns):
- info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang
-
- if encumbered:
- el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
- info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find(".//{%s}text/{%s}body" % (ns, ns))
- info["body"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
- info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
- info["annex"] = (el or None) and " ".join(el.itertext()).strip()
-
- # remove empty/null keys
- keys = list(info.keys())
- for k in keys:
- if not info[k]:
- info.pop(k)
- return info
-
-
-def main() -> None: # pragma no cover
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- description="GROBID TEI XML to JSON",
- usage="%(prog)s [options] <teifile>...",
- )
- parser.add_argument(
- "--no-encumbered",
- action="store_true",
- help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
- )
- parser.add_argument("teifiles", nargs="+")
-
- args = parser.parse_args()
-
- for filename in args.teifiles:
- content = open(filename, "r").read()
- print(
- json.dumps(
- teixml2json(content, encumbered=(not args.no_encumbered)),
- sort_keys=True,
- )
- )
-
-
-if __name__ == "__main__": # pragma no cover
- main()
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 2a1d8b5..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -1,21 +1,28 @@
#!/usr/bin/env python3
-
"""
These are generally for running one-off tasks from the command line. Output
might go to stdout, or might go to Kafka topic.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
-import sys
-import json
import argparse
-import datetime
+import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
-from grobid2json import teixml2json
from sandcrawler import *
+from sandcrawler.grobid import CrossrefRefsWorker
+
+
+def run_single(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ resp = grobid_client.process_fulltext(blob=args.pdf_file.read())
+ resp["_metadata"] = grobid_client.metadata(resp)
+ print(json.dumps(resp, sort_keys=True))
def run_extract_json(args):
@@ -30,6 +37,7 @@ def run_extract_json(args):
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_extract_cdx(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
@@ -40,7 +48,7 @@ def run_extract_cdx(args):
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
@@ -49,10 +57,11 @@ def run_extract_cdx(args):
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_extract_zipfile(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
if args.jobs > 1:
@@ -65,6 +74,7 @@ def run_extract_zipfile(args):
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
+
def run_transform(args):
grobid_client = GrobidClient()
for line in args.json_file:
@@ -74,62 +84,101 @@ def run_transform(args):
if args.metadata_only:
out = grobid_client.metadata(line)
else:
- out = teixml2json(line['tei_xml'])
+ tei_doc = parse_document_xml(line["tei_xml"])
+ out = tei_doc.to_legacy_dict()
if out:
- if 'source' in line:
- out['source'] = line['source']
+ if "source" in line:
+ out["source"] = line["source"]
print(json.dumps(out))
+def run_parse_crossref_refs(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ worker = CrossrefRefsWorker(grobid_client, sink=args.sink)
+ pusher = JsonLinePusher(worker, args.json_file)
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
- parser.add_argument('--grobid-host',
- default="http://grobid.qa.fatcat.wiki",
- help="GROBID API host/port")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json',
- help="for each JSON line with CDX info, fetches PDF and does GROBID extraction")
+ sub_single = subparsers.add_parser("single")
+ sub_single.set_defaults(func=run_single)
+ sub_single.add_argument(
+ "pdf_file",
+ help="path to PDF file to process",
+ type=argparse.FileType("rb"),
+ )
+
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does GROBID extraction",
+ )
sub_extract_json.set_defaults(func=run_extract_json)
- sub_extract_json.add_argument('json_file',
+ sub_extract_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_cdx = subparsers.add_parser('extract-cdx',
- help="for each CDX line, fetches PDF and does GROBID extraction")
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does GROBID extraction"
+ )
sub_extract_cdx.set_defaults(func=run_extract_cdx)
- sub_extract_cdx.add_argument('cdx_file',
+ sub_extract_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
- help="opens zipfile, iterates over PDF files inside and does GROBID extract for each")
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does GROBID extract for each",
+ )
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
- sub_extract_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to extract",
- type=str)
-
- sub_transform = subparsers.add_parser('transform')
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
+
+ sub_parse_crossref_refs = subparsers.add_parser(
+ "parse-crossref-refs",
+ help="reads Crossref metadata records, parses any unstructured refs with GROBID",
+ )
+ sub_parse_crossref_refs.set_defaults(func=run_parse_crossref_refs)
+ sub_parse_crossref_refs.add_argument(
+ "json_file",
+ help="JSON-L file to process (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_transform = subparsers.add_parser("transform")
sub_transform.set_defaults(func=run_transform)
- sub_transform.add_argument('--metadata-only',
- action='store_true',
- help="Only pass through bibliographic metadata, not fulltext")
- sub_transform.add_argument('json_file',
+ sub_transform.add_argument(
+ "--metadata-only",
+ action="store_true",
+ help="Only pass through bibliographic metadata, not fulltext",
+ )
+ sub_transform.add_argument(
+ "json_file",
help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -140,10 +189,10 @@ def main():
if args.kafka_mode:
produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
- produce_topic=produce_topic)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
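For orientation, the new `single` subcommand and the updated `transform`
subcommand above could be invoked roughly as follows (a hedged sketch:
`paper.pdf` and `grobid_output.json` are placeholder file names, and the
`--grobid-host` flag is only needed when the default QA host is not the
desired target):

    # extract one local PDF with GROBID, printing JSON to stdout
    ./grobid_tool.py --grobid-host http://localhost:8070 single paper.pdf

    # convert JSON lines containing a tei_xml field to the legacy JSON shape
    cat grobid_output.json | ./grobid_tool.py transform - > grobid_legacy.json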
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 20c65bb..493c9e7 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python3
-
"""
Input is IA item metadata JSON.
-Ouput is insertable fatcat "match" JSON
+Output is insertable fatcat "match" JSON
- md5
- sha1
@@ -22,87 +21,93 @@ When invoking import matched, be sure to:
--default-mimetype application/pdf
"""
-import sys
import json
+import sys
+from typing import Any, Dict, Optional
-def parse(obj):
- if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
- print('skip: test item', file=sys.stderr)
+
+def parse(obj: dict) -> Optional[Dict[str, Any]]:
+ if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"):
+ print("skip: test item", file=sys.stderr)
return None
extid_type = None
extid = None
- if obj['metadata']['identifier'].startswith('arxiv-'):
- extid_type = 'arxiv'
- extid = obj['metadata'].get('source')
+ if obj["metadata"]["identifier"].startswith("arxiv-"):
+ extid_type = "arxiv"
+ extid = obj["metadata"].get("source")
if not extid:
- print('skip: no source', file=sys.stderr)
+ print("skip: no source", file=sys.stderr)
return None
- assert extid.startswith('http://arxiv.org/abs/')
- extid = extid.replace('http://arxiv.org/abs/', '')
- #print(extid)
- assert '/' in extid or '.' in extid
- if not 'v' in extid or not extid[-1].isdigit():
- print('skip: non-versioned arxiv_id', file=sys.stderr)
+ assert extid.startswith("http://arxiv.org/abs/")
+ extid = extid.replace("http://arxiv.org/abs/", "")
+ # print(extid)
+ assert "/" in extid or "." in extid
+ if "v" not in extid or not extid[-1].isdigit():
+ print("skip: non-versioned arxiv_id", file=sys.stderr)
return None
- elif obj['metadata']['identifier'].startswith('paper-doi-10_'):
- extid_type = 'doi'
- extid = obj['metadata']['identifier-doi']
+ elif obj["metadata"]["identifier"].startswith("paper-doi-10_"):
+ extid_type = "doi"
+ extid = obj["metadata"]["identifier-doi"]
assert extid.startswith("10.")
- elif obj['metadata']['identifier'].startswith('pubmed-PMC'):
- extid_type = 'pmcid'
- extid = obj['metadata']['identifier'].replace('pubmed-', '')
+ elif obj["metadata"]["identifier"].startswith("pubmed-PMC"):
+ extid_type = "pmcid"
+ extid = obj["metadata"]["identifier"].replace("pubmed-", "")
assert extid.startswith("PMC")
int(extid[3:])
- elif obj['metadata']['identifier'].startswith('jstor-'):
- extid_type = 'jstor'
- extid = obj['metadata']['identifier'].replace('jstor-', '')
+ elif obj["metadata"]["identifier"].startswith("jstor-"):
+ extid_type = "jstor"
+ extid = obj["metadata"]["identifier"].replace("jstor-", "")
int(extid)
else:
raise NotImplementedError()
pdf_file = None
- for f in obj['files']:
- if f['source'] == "original" and "PDF" in f['format']:
+ for f in obj["files"]:
+ if f["source"] == "original" and "PDF" in f["format"]:
pdf_file = f
break
if not pdf_file:
- print('skip: no PDF found: {}'.format(obj['metadata']['identifier']), file=sys.stderr)
- #for f in obj['files']:
+ print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr)
+ # for f in obj['files']:
# print(f['format'], file=sys.stderr)
return None
- assert pdf_file['name'].endswith('.pdf')
+ assert pdf_file["name"].endswith(".pdf")
match = {
- 'md5': pdf_file['md5'],
- 'sha1': pdf_file['sha1'],
- 'size': int(pdf_file['size']),
- 'mimetype': 'application/pdf',
- 'urls': [
+ "md5": pdf_file["md5"],
+ "sha1": pdf_file["sha1"],
+ "size": int(pdf_file["size"]),
+ "mimetype": "application/pdf",
+ "urls": [
"https://archive.org/download/{}/{}".format(
- obj['metadata']['identifier'],
- pdf_file['name']),
+ obj["metadata"]["identifier"], pdf_file["name"]
+ ),
],
- 'cdx': [],
- 'dois': [],
+ "cdx": [],
+ "dois": [],
}
- if extid_type == 'doi':
- match['dois'] = [extid,]
+ if extid_type == "doi":
+ match["dois"] = [
+ extid,
+ ]
else:
match[extid_type] = extid
return match
-def run():
+
+def run() -> None:
for line in sys.stdin:
if not line:
continue
obj = json.loads(line)
match = parse(obj)
- if match:
+ if match is not None:
print(json.dumps(match, sort_keys=True))
-if __name__ == '__main__':
+
+if __name__ == "__main__":
run()
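To illustrate the parse() contract above, here is a hand-written sketch (not part of the committed changes) of an arxiv-style IA item and the match it would produce. All metadata values are invented placeholders; the hashes are just the well-known empty-input digests.

import json

from ia_pdf_match import parse  # assumes running from the python/ directory

# Invented example item, shaped like the fields parse() reads above.
item = {
    "metadata": {
        "identifier": "arxiv-1234.56789",
        "source": "http://arxiv.org/abs/1234.56789v2",
    },
    "files": [
        {
            "source": "original",
            "format": "Text PDF",
            "name": "1234.56789v2.pdf",
            "md5": "d41d8cd98f00b204e9800998ecf8427e",
            "sha1": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
            "size": "12345",
        }
    ],
}

# Prints roughly: {"arxiv": "1234.56789v2", "cdx": [], "dois": [], "md5": ...,
# "mimetype": "application/pdf", "sha1": ..., "size": 12345,
# "urls": ["https://archive.org/download/arxiv-1234.56789/1234.56789v2.pdf"]}
print(json.dumps(parse(item), sort_keys=True))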
diff --git a/python/ingest_file.py b/python/ingest_file.py
deleted file mode 100755
index 20b6d67..0000000
--- a/python/ingest_file.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import argparse
-
-from http.server import HTTPServer
-from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker
-
-
-def run_single_ingest(args):
- request = dict(
- ingest_type=args.ingest_type,
- base_url=args.url,
- ext_ids=dict(doi=args.doi),
- fatcat=dict(release_ident=args.release_id),
- )
- if args.force_recrawl:
- request['force_recrawl'] = True
- ingester = IngestFileWorker(
- try_spn2=not args.no_spn2,
- html_quick_mode=args.html_quick_mode,
- )
- result = ingester.process(request)
- print(json.dumps(result, sort_keys=True))
- return result
-
-def run_requests(args):
- # TODO: switch to using JsonLinePusher
- ingester = IngestFileWorker(
- try_spn2=not args.no_spn2,
- html_quick_mode=args.html_quick_mode,
- )
- for l in args.json_file:
- request = json.loads(l.strip())
- result = ingester.process(request)
- print(json.dumps(result, sort_keys=True))
-
-def run_api(args):
- port = 8083
- print("Listening on localhost:{}".format(port))
- server = HTTPServer(('', port), IngestFileRequestHandler)
- server.serve_forever()
-
-def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- subparsers = parser.add_subparsers()
-
- sub_single= subparsers.add_parser('single',
- help="ingests a single file URL")
- sub_single.set_defaults(func=run_single_ingest)
- sub_single.add_argument('--release-id',
- help="(optional) existing release ident to match to")
- sub_single.add_argument('--doi',
- help="(optional) existing release DOI to match to")
- sub_single.add_argument('--force-recrawl',
- action='store_true',
- help="ignore GWB history and use SPNv2 to re-crawl")
- sub_single.add_argument('--no-spn2',
- action='store_true',
- help="don't use live web (SPNv2)")
- sub_single.add_argument('--ingest-type',
- default="pdf",
- help="type of ingest (pdf, html, etc)")
- sub_single.add_argument('--html-quick-mode',
- action='store_true',
- help="don't fetch individual sub-resources, just use CDX")
- sub_single.add_argument('url',
- help="URL of paper to fetch")
-
- sub_requests = subparsers.add_parser('requests',
- help="takes a series of ingest requests (JSON, per line) and runs each")
- sub_requests.add_argument('--no-spn2',
- action='store_true',
- help="don't use live web (SPNv2)")
- sub_requests.add_argument('--html-quick-mode',
- action='store_true',
- help="don't fetch individual sub-resources, just use CDX")
- sub_requests.set_defaults(func=run_requests)
- sub_requests.add_argument('json_file',
- help="JSON file (request per line) to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
-
- sub_api = subparsers.add_parser('api',
- help="starts a simple HTTP server that processes ingest requests")
- sub_api.set_defaults(func=run_api)
- sub_api.add_argument('--port',
- help="HTTP port to listen on",
- default=8033, type=int)
-
- args = parser.parse_args()
- if not args.__dict__.get("func"):
- parser.print_help(file=sys.stderr)
- sys.exit(-1)
-
- args.func(args)
-
-if __name__ == '__main__':
- main()
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
new file mode 100755
index 0000000..0b74f9f
--- /dev/null
+++ b/python/ingest_tool.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import subprocess
+import sys
+from http.server import HTTPServer
+
+import sentry_sdk
+
+from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
+from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
+from sandcrawler.ingest_fileset import IngestFilesetWorker
+
+
+def run_single_ingest(args):
+ request = dict(
+ ingest_type=args.ingest_type,
+ base_url=args.url,
+ ext_ids=dict(doi=args.doi),
+ fatcat=dict(release_ident=args.release_id),
+ )
+ if args.force_recrawl:
+ request["force_recrawl"] = True
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ ingester = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ ingest_file_result_stdout=True,
+ )
+ else:
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ grobid_client=grobid_client,
+ )
+ result = ingester.process(request)
+ print(json.dumps(result, sort_keys=True))
+ return result
+
+
+def run_requests(args):
+ # TODO: switch to using JsonLinePusher
+ file_worker = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ )
+ fileset_worker = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ )
+ for line in args.json_file:
+ request = json.loads(line.strip())
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ result = fileset_worker.process(request)
+ else:
+ result = file_worker.process(request)
+ print(json.dumps(result, sort_keys=True))
+
+
+def run_file_requests_backfill(args):
+ """
+ Special mode for persisting GROBID and pdfextract results to Kafka, but
+ printing ingest result to stdout.
+
+ Can be used to batch re-process known files.
+ """
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
+ grobid_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=grobid_topic,
+ )
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ pdftext_sink = KafkaCompressSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=pdftext_topic,
+ )
+ thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=thumbnail_topic,
+ )
+ xmldoc_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=xmldoc_topic,
+ )
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
+ worker = IngestFileWorker(
+ grobid_client=grobid_client,
+ sink=None,
+ grobid_sink=grobid_sink,
+ thumbnail_sink=thumbnail_sink,
+ pdftext_sink=pdftext_sink,
+ xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
+ try_spn2=False,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ )
+ pusher.run()
+
+
+def run_spn_status(args):
+ worker = IngestFileWorker(
+ sink=None,
+ try_spn2=False,
+ )
+
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/system")
+ resp.raise_for_status()
+ print(f"System status: {json.dumps(resp.json(), sort_keys=True)}")
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/user")
+ resp.raise_for_status()
+ print(f"User status: {json.dumps(resp.json(), sort_keys=True)}")
+
+
+def run_api(args):
+ port = 8083
+ print("Listening on localhost:{}".format(port))
+ server = HTTPServer(("", port), IngestFileRequestHandler)
+ server.serve_forever()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--enable-sentry",
+ action="store_true",
+ help="report exceptions to Sentry",
+ )
+ parser.add_argument("--env", default="dev", help="environment (eg, prod, qa, dev)")
+ subparsers = parser.add_subparsers()
+
+ sub_single = subparsers.add_parser("single", help="ingests a single base URL")
+ sub_single.set_defaults(func=run_single_ingest)
+ sub_single.add_argument(
+ "ingest_type", default="pdf", help="type of ingest (pdf, html, etc)"
+ )
+ sub_single.add_argument(
+ "--release-id", help="(optional) existing release ident to match to"
+ )
+ sub_single.add_argument("--doi", help="(optional) existing release DOI to match to")
+ sub_single.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="ignore GWB history and use SPNv2 to re-crawl",
+ )
+ sub_single.add_argument("--no-spn2", action="store_true", help="don't use live web (SPNv2)")
+ sub_single.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_single.add_argument("url", help="URL of paper to fetch")
+ sub_single.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
+ sub_requests = subparsers.add_parser(
+ "requests", help="takes a series of ingest requests (JSON, per line) and runs each"
+ )
+ sub_requests.add_argument(
+ "--no-spn2", action="store_true", help="don't use live web (SPNv2)"
+ )
+ sub_requests.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_requests.set_defaults(func=run_requests)
+ sub_requests.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ sub_api = subparsers.add_parser(
+ "api", help="starts a simple HTTP server that processes ingest requests"
+ )
+ sub_api.set_defaults(func=run_api)
+ sub_api.add_argument("--port", help="HTTP port to listen on", default=8033, type=int)
+
+ sub_file_requests_backfill = subparsers.add_parser(
+ "file-requests-backfill",
+ help="starts a simple HTTP server that processes ingest requests",
+ )
+ sub_file_requests_backfill.set_defaults(func=run_file_requests_backfill)
+ sub_file_requests_backfill.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_file_requests_backfill.add_argument(
+ "--kafka-hosts",
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use",
+ )
+ sub_file_requests_backfill.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
+ sub_spn_status = subparsers.add_parser(
+ "spn-status", help="checks save-page-now v2 API status for bot user"
+ )
+ sub_spn_status.set_defaults(func=run_spn_status)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ # configure sentry *after* parsing args
+ if args.enable_sentry:
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
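A rough sketch (not part of the committed changes) of driving the same worker from Python rather than the CLI, mirroring run_single_ingest() above. The URL and GROBID host are placeholders, and a real request would usually also carry ext_ids/fatcat context as run_single_ingest() builds.

import json

from sandcrawler import GrobidClient
from sandcrawler.ingest_file import IngestFileWorker

# Placeholder GROBID host; any reachable instance would do.
grobid_client = GrobidClient(host_url="https://grobid.qa.fatcat.wiki")
worker = IngestFileWorker(try_spn2=False, grobid_client=grobid_client)

# Minimal request; placeholder URL.
request = {
    "ingest_type": "pdf",
    "base_url": "https://example.com/some-paper.pdf",
}
result = worker.process(request)
print(json.dumps(result, sort_keys=True))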
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index 10a0f48..28d6397 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -1,15 +1,11 @@
#!/usr/bin/env python3
-
"""
KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
"""
-import sys
-import json
import argparse
-import datetime
+import sys
-from grobid2json import teixml2json
from sandcrawler import *
@@ -20,10 +16,13 @@ def run_extract_json(args):
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
- worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_extract_cdx(args):
wayback_client = WaybackClient()
if args.jobs > 1:
@@ -33,19 +32,22 @@ def run_extract_cdx(args):
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
- worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
pusher = CdxLinePusher(
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_extract_zipfile(args):
if args.jobs > 1:
print("multi-processing: {}".format(args.jobs), file=sys.stderr)
@@ -57,9 +59,10 @@ def run_extract_zipfile(args):
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
+
def run_single(args):
worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
- with open(args.pdf_file, 'rb') as pdf_file:
+ with open(args.pdf_file, "rb") as pdf_file:
pdf_bytes = pdf_file.read()
worker.push_record(pdf_bytes)
worker.finish()
@@ -67,51 +70,55 @@ def run_single(args):
args.thumbnail_sink.finish()
-
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json',
- help="for each JSON line with CDX info, fetches PDF and does PDF extraction")
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does PDF extraction",
+ )
sub_extract_json.set_defaults(func=run_extract_json)
- sub_extract_json.add_argument('json_file',
+ sub_extract_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_cdx = subparsers.add_parser('extract-cdx',
- help="for each CDX line, fetches PDF and does PDF extraction")
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does PDF extraction"
+ )
sub_extract_cdx.set_defaults(func=run_extract_cdx)
- sub_extract_cdx.add_argument('cdx_file',
+ sub_extract_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
- help="opens zipfile, iterates over PDF files inside and does PDF extract for each")
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does PDF extract for each",
+ )
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
- sub_extract_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to extract",
- type=str)
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
- sub_single = subparsers.add_parser('single',
- help="opens single PDF and extracts it")
+ sub_single = subparsers.add_parser("single", help="opens single PDF and extracts it")
sub_single.set_defaults(func=run_single)
- sub_single.add_argument('pdf_file',
- help="single PDF to extract",
- type=str)
+ sub_single.add_argument("pdf_file", help="single PDF to extract", type=str)
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -123,17 +130,22 @@ def main():
if args.kafka_mode:
text_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
- args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
- produce_topic=text_topic)
- args.thumbnail_sink = KafkaSink(kafka_hosts=args.kafka_hosts,
- produce_topic=thumbnail_topic)
- print("Running in kafka output mode, publishing to {} and {}\n".format(
- text_topic, thumbnail_topic), file=sys.stderr)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=text_topic)
+ args.thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts, produce_topic=thumbnail_topic
+ )
+ print(
+ "Running in kafka output mode, publishing to {} and {}\n".format(
+ text_topic, thumbnail_topic
+ ),
+ file=sys.stderr,
+ )
else:
args.sink = None
args.thumbnail_sink = None
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
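For reference, a small sketch (not part of the committed changes) that mirrors run_single() above without the CLI; "example.pdf" is a placeholder path and no Kafka sinks are attached.

from sandcrawler import PdfExtractBlobWorker

# No sinks attached here; run_single() above wires sink/thumbnail_sink only
# when --kafka-mode is set.
worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None)
with open("example.pdf", "rb") as f:  # placeholder filename
    worker.push_record(f.read())
worker.finish()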
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 5cffa8c..24b749d 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -1,18 +1,15 @@
#!/usr/bin/env python3
-
"""
Basically just a copy of grobid_tool.py, but for PDF classification instead of
text extraction.
Example of large parallel run, locally:
-cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
-import sys
-import json
import argparse
-import datetime
+import sys
from sandcrawler import *
@@ -21,37 +18,47 @@ def run_classify_pdf_json(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_classify_pdf_cdx(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = CdxLinePusher(
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
pusher = CdxLinePusher(
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_classify_pdf_zipfile(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
worker = PdfTrioBlobWorker(pdftrio_client, sink=args.sink, mode=args.pdftrio_mode)
@@ -60,48 +67,59 @@ def run_classify_pdf_zipfile(args):
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
- parser.add_argument('--pdftrio-host',
- default="http://pdftrio.qa.fatcat.wiki",
- help="pdftrio API host/port")
- parser.add_argument('--pdftrio-mode',
- default="auto",
- help="which classification mode to use")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--pdftrio-host", default="http://pdftrio.qa.fatcat.wiki", help="pdftrio API host/port"
+ )
+ parser.add_argument(
+ "--pdftrio-mode", default="auto", help="which classification mode to use"
+ )
subparsers = parser.add_subparsers()
- sub_classify_pdf_json = subparsers.add_parser('classify-pdf-json',
- help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion")
+ sub_classify_pdf_json = subparsers.add_parser(
+ "classify-pdf-json",
+ help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion",
+ )
sub_classify_pdf_json.set_defaults(func=run_classify_pdf_json)
- sub_classify_pdf_json.add_argument('json_file',
+ sub_classify_pdf_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_classify_pdf_cdx = subparsers.add_parser('classify-pdf-cdx',
- help="for each CDX line, fetches PDF and does pdftrio classify_pdfion")
+ sub_classify_pdf_cdx = subparsers.add_parser(
+ "classify-pdf-cdx",
+ help="for each CDX line, fetches PDF and does pdftrio classify_pdfion",
+ )
sub_classify_pdf_cdx.set_defaults(func=run_classify_pdf_cdx)
- sub_classify_pdf_cdx.add_argument('cdx_file',
+ sub_classify_pdf_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_classify_pdf_zipfile = subparsers.add_parser('classify-pdf-zipfile',
- help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each")
+ sub_classify_pdf_zipfile = subparsers.add_parser(
+ "classify-pdf-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each",
+ )
sub_classify_pdf_zipfile.set_defaults(func=run_classify_pdf_zipfile)
- sub_classify_pdf_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to classify",
- type=str)
+ sub_classify_pdf_zipfile.add_argument(
+ "zip_file", help="zipfile with PDFs to classify", type=str
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -112,10 +130,10 @@ def main():
if args.kafka_mode:
produce_topic = "sandcrawler-{}.pdftrio-output".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaSink(kafka_hosts=args.kafka_hosts,
- produce_topic=produce_topic)
+ args.sink = KafkaSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
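A companion sketch (not part of the committed changes) mirroring the blob-worker wiring in run_classify_pdf_zipfile() above. The host matches the docstring example, the filename is a placeholder, and it assumes the blob worker accepts raw PDF bytes via push_record(), as the pdfextract blob worker does.

from sandcrawler import PdfTrioBlobWorker, PdfTrioClient

# Placeholder service host (the docstring example uses http://localhost:3939).
pdftrio_client = PdfTrioClient(host_url="http://localhost:3939")
worker = PdfTrioBlobWorker(pdftrio_client, sink=None, mode="auto")
with open("example.pdf", "rb") as f:  # placeholder filename
    worker.push_record(f.read())
worker.finish()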
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 69e9374..e08d66c 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
@@ -7,9 +6,9 @@ Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
"""
+import argparse
import os
import sys
-import argparse
from sandcrawler import *
from sandcrawler.persist import *
@@ -19,7 +18,7 @@ def run_cdx(args):
worker = PersistCdxWorker(
db_url=args.db_url,
)
- filter_mimetypes = ['application/pdf']
+ filter_mimetypes = ["application/pdf"]
if args.no_mimetype_filter:
filter_mimetypes = None
pusher = CdxLinePusher(
@@ -27,11 +26,12 @@ def run_cdx(args):
args.cdx_file,
filter_http_statuses=[200, 226],
filter_mimetypes=filter_mimetypes,
- #allow_octet_stream
+ # allow_octet_stream
batch_size=200,
)
pusher.run()
+
def run_grobid(args):
worker = PersistGrobidWorker(
db_url=args.db_url,
@@ -49,6 +49,7 @@ def run_grobid(args):
)
pusher.run()
+
def run_grobid_disk(args):
"""
Writes XML to individual files on disk, and also prints non-XML metadata to
@@ -63,6 +64,7 @@ def run_grobid_disk(args):
)
pusher.run()
+
def run_pdftrio(args):
worker = PersistPdfTrioWorker(
db_url=args.db_url,
@@ -74,6 +76,7 @@ def run_pdftrio(args):
)
pusher.run()
+
def run_pdftext(args):
worker = PersistPdfTextWorker(
db_url=args.db_url,
@@ -91,6 +94,7 @@ def run_pdftext(args):
)
pusher.run()
+
def run_ingest_file_result(args):
worker = PersistIngestFileResultWorker(
db_url=args.db_url,
@@ -102,6 +106,7 @@ def run_ingest_file_result(args):
)
pusher.run()
+
def run_ingest_request(args):
worker = PersistIngestRequestWorker(
db_url=args.db_url,
@@ -113,92 +118,186 @@ def run_ingest_request(args):
)
pusher.run()
+
+def run_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
+def run_grobid_refs(args):
+ worker = PersistGrobidRefsWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=100,
+ )
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--db-url',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--db-url",
help="postgresql database connection string",
- default="postgres:///sandcrawler")
- parser.add_argument('--s3-url',
- help="S3 (seaweedfs) backend URL",
- default="localhost:9000")
- parser.add_argument('--s3-access-key',
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
- parser.add_argument('--s3-secret-key',
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_SECRET_KEY'))
- parser.add_argument('--s3-bucket',
- help="S3 (seaweedfs) bucket to persist into",
- default="sandcrawler-dev")
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
subparsers = parser.add_subparsers()
- sub_cdx = subparsers.add_parser('cdx',
- help="backfill a CDX file into postgresql cdx table")
+ sub_cdx = subparsers.add_parser("cdx", help="backfill a CDX file into postgresql cdx table")
sub_cdx.set_defaults(func=run_cdx)
- sub_cdx.add_argument('cdx_file',
+ sub_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_cdx.add_argument('--no-mimetype-filter',
- action='store_true',
- help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
+ type=argparse.FileType("r"),
+ )
+ sub_cdx.add_argument(
+ "--no-mimetype-filter",
+ action="store_true",
+ help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)",
+ )
- sub_grobid = subparsers.add_parser('grobid',
- help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ sub_grobid = subparsers.add_parser(
+ "grobid", help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_grobid.set_defaults(func=run_grobid)
- sub_grobid.add_argument('json_file',
+ sub_grobid.add_argument(
+ "json_file",
help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_grobid.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_grobid.add_argument('--db-only',
- action='store_true',
- help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
-
- sub_pdftext = subparsers.add_parser('pdftext',
- help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ type=argparse.FileType("r"),
+ )
+ sub_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_pdftext = subparsers.add_parser(
+ "pdftext", help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_pdftext.set_defaults(func=run_pdftext)
- sub_pdftext.add_argument('json_file',
+ sub_pdftext.add_argument(
+ "json_file",
help="pdftext file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_pdftext.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_pdftext.add_argument('--db-only',
- action='store_true',
- help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
-
- sub_grobid_disk = subparsers.add_parser('grobid-disk',
- help="dump GRBOID output to (local) files on disk")
+ type=argparse.FileType("r"),
+ )
+ sub_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_grobid_disk = subparsers.add_parser(
+ "grobid-disk", help="dump GRBOID output to (local) files on disk"
+ )
sub_grobid_disk.set_defaults(func=run_grobid_disk)
- sub_grobid_disk.add_argument('json_file',
+ sub_grobid_disk.add_argument(
+ "json_file",
help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_grobid_disk.add_argument('output_dir',
- help="base directory to output into",
- type=str)
+ type=argparse.FileType("r"),
+ )
+ sub_grobid_disk.add_argument("output_dir", help="base directory to output into", type=str)
- sub_pdftrio = subparsers.add_parser('pdftrio',
- help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ sub_pdftrio = subparsers.add_parser(
+ "pdftrio", help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_pdftrio.set_defaults(func=run_pdftrio)
- sub_pdftrio.add_argument('json_file',
+ sub_pdftrio.add_argument(
+ "json_file",
help="pdftrio file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
- help="backfill a ingest_file_result JSON dump into postgresql")
+ sub_ingest_file_result = subparsers.add_parser(
+ "ingest-file-result", help="backfill a ingest_file_result JSON dump into postgresql"
+ )
sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
- sub_ingest_file_result.add_argument('json_file',
+ sub_ingest_file_result.add_argument(
+ "json_file",
help="ingest_file_result file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_ingest_request = subparsers.add_parser('ingest-request',
- help="backfill a ingest_request JSON dump into postgresql")
+ sub_ingest_request = subparsers.add_parser(
+ "ingest-request", help="backfill a ingest_request JSON dump into postgresql"
+ )
sub_ingest_request.set_defaults(func=run_ingest_request)
- sub_ingest_request.add_argument('json_file',
+ sub_ingest_request.add_argument(
+ "json_file",
help="ingest_request to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
+
+ sub_crossref = subparsers.add_parser(
+ "crossref",
+ help="backfill a crossref JSON dump into postgresql, and extract references at the same time",
+ )
+ sub_crossref.set_defaults(func=run_crossref)
+ sub_crossref.add_argument(
+ "json_file",
+ help="crossref file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+
+ sub_grobid_refs = subparsers.add_parser(
+ "grobid-refs", help="backfill a grobid_refs JSON dump into postgresql"
+ )
+ sub_grobid_refs.set_defaults(func=run_grobid_refs)
+ sub_grobid_refs.add_argument(
+ "json_file",
+ help="grobid_refs to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -207,5 +306,6 @@ def main():
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
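A sketch (not part of the committed changes) of the same crossref backfill wiring as run_crossref() above, driven directly from Python over stdin. The database URL and GROBID host are the CLI defaults shown above, used here as placeholders.

import sys

from sandcrawler import GrobidClient, JsonLinePusher
from sandcrawler.persist import PersistCrossrefWorker

# Placeholder DB URL and GROBID host, matching the argparse defaults above.
worker = PersistCrossrefWorker(
    db_url="postgres:///sandcrawler",
    grobid_client=GrobidClient(host_url="https://grobid.qa.fatcat.wiki"),
    parse_refs=True,
)
# run_crossref() drops the batch size to 10 when --parse-refs is enabled.
pusher = JsonLinePusher(worker, sys.stdin, batch_size=10)
pusher.run()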
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..2cef007
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.isort]
+profile = "black"
+line_length = 96
diff --git a/python/pytest.ini b/python/pytest.ini
index 034a68e..18e8cf0 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -17,5 +17,10 @@ filterwarnings =
ignore::DeprecationWarning:.*wayback
ignore::DeprecationWarning:.*PIL
ignore::DeprecationWarning:.*justext
+ ignore::DeprecationWarning:.*internetarchive
+ ignore::DeprecationWarning:.*minio
+ ignore::DeprecationWarning:.*base_reporter
+ ignore::DeprecationWarning:.*loccache
+ ignore:.*pytz-deprecation-shim
log_level = INFO
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index e461462..469c2a2 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,10 +1,49 @@
-
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
-from .ingest import IngestFileWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (
+ CdxApiClient,
+ CdxApiError,
+ CdxPartial,
+ CdxRow,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WarcResource,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .ingest_file import IngestFileWorker
+from .ingest_fileset import IngestFilesetWorker
+from .misc import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_datetime,
+ parse_cdx_line,
+)
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (
+ PersistCdxWorker,
+ PersistGrobidDiskWorker,
+ PersistGrobidWorker,
+ PersistIngestFileResultWorker,
+ PersistIngestRequestWorker,
+ PersistPdfTextWorker,
+ PersistPdfTrioWorker,
+ PersistThumbnailWorker,
+)
+from .workers import (
+ BlackholeSink,
+ CdxLinePusher,
+ JsonLinePusher,
+ KafkaCompressSink,
+ KafkaJsonPusher,
+ KafkaSink,
+ MultiprocessWrapper,
+ ZipfilePusher,
+)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 066e53b..f9018ec 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,51 +1,58 @@
-
-import json
import datetime
-from typing import Optional
+import json
+from typing import Any, Dict, List, Optional, Tuple
import psycopg2
import psycopg2.extras
-import requests
-class SandcrawlerPostgrestClient:
+from .misc import requests_retry_session
- def __init__(self, api_url="http://aitio.us.archive.org:3030", **kwargs):
+
+class SandcrawlerPostgrestClient:
+ def __init__(self, api_url: str = "http://wbgrp-svc506.us.archive.org:3030", **kwargs):
self.api_url = api_url
+ self.http_session = requests_retry_session()
- def get_cdx(self, url):
- resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url))
+ def get_cdx(self, url: str) -> Optional[dict]:
+ resp = self.http_session.get(self.api_url + "/cdx", params=dict(url="eq." + url))
resp.raise_for_status()
return resp.json() or None
- def get_grobid(self, sha1):
- resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.'+sha1))
+ def get_grobid(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_pdftrio(self, sha1):
- resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex='eq.'+sha1))
+ def get_pdftrio(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdftrio", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_pdf_meta(self, sha1):
- resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex='eq.'+sha1))
+ def get_pdf_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
def get_html_meta(self, sha1hex: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/html_meta",
params=dict(sha1hex=f"eq.{sha1hex}"),
)
@@ -56,17 +63,19 @@ class SandcrawlerPostgrestClient:
else:
return None
- def get_file_meta(self, sha1):
- resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1))
+ def get_file_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/file_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/ingest_file_result",
params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
)
@@ -77,27 +86,76 @@ class SandcrawlerPostgrestClient:
else:
return None
-class SandcrawlerPostgresClient:
+ def get_ingest_fileset_platform(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/ingest_fileset_platform",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(self.api_url + "/crossref", params=dict(doi=f"eq.{doi}"))
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref_with_refs(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/crossref_with_refs", params=dict(doi=f"eq.{doi}")
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
- def __init__(self, db_url, **kwargs):
+ def get_grobid_refs(self, source: str, source_id: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid_refs",
+ params=dict(source=f"eq.{source}", source_id=f"eq.{source_id}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+
+class SandcrawlerPostgresClient:
+ def __init__(self, db_url: str, **kwargs):
self.conn = psycopg2.connect(db_url)
- def cursor(self):
+ def cursor(self) -> psycopg2.extensions.cursor:
return self.conn.cursor()
- def commit(self):
- return self.conn.commit()
+ def commit(self) -> None:
+ self.conn.commit()
- def _inserts_and_updates(self, resp, on_conflict):
- resp = [int(r[0]) for r in resp]
- inserts = len([r for r in resp if r == 0])
+ def _inserts_and_updates(self, resp: List[Tuple], on_conflict: str) -> Tuple[int, int]:
+ resp_codes = [int(r[0]) for r in resp]
+ inserts = len([r for r in resp_codes if r == 0])
if on_conflict == "update":
- updates = len([r for r in resp if r != 0])
+ updates = len([r for r in resp_codes if r != 0])
else:
updates = 0
return (inserts, updates)
- def insert_cdx(self, cur, batch, on_conflict="nothing"):
+ def insert_cdx(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
@@ -110,26 +168,35 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d for d in batch if d.get('warc_path')]
+ batch = [d for d in batch if d.get("warc_path")]
if not batch:
return (0, 0)
- batch = [(d['url'],
- d['datetime'],
- d['sha1hex'],
- d['mimetype'],
- d['warc_path'],
- int(d['warc_csize']),
- int(d['warc_offset']))
- for d in batch]
+ rows = [
+ (
+ d["url"],
+ d["datetime"],
+ d["sha1hex"],
+ d["mimetype"],
+ d["warc_path"],
+ int(d["warc_csize"]),
+ int(d["warc_offset"]),
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (url, datetime)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_file_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_file_meta(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
@@ -148,21 +215,24 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [(d['sha1hex'],
- d['sha256hex'],
- d['md5hex'],
- int(d['size_bytes']),
- d['mimetype'])
- for d in batch]
+ rows = [
+ (d["sha1hex"], d["sha256hex"], d["md5hex"], int(d["size_bytes"]), d["mimetype"])
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_grobid(self, cur, batch, on_conflict="nothing"):
+ def insert_grobid(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
@@ -184,33 +254,39 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
for r in batch:
- if r.get('metadata'):
+ if r.get("metadata"):
# sometimes these are only in metadata; shouldn't pass through
# though (to save database space)
- dupe_fields = ('fatcat_release', 'grobid_version')
+ dupe_fields = ("fatcat_release", "grobid_version")
for k in dupe_fields:
- if not k in r:
- r[k] = r['metadata'].get(k)
- r['metadata'].pop(k, None)
- r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
- batch = [(d['key'],
- d.get('grobid_version') or None,
- d['status_code'],
- d['status'],
- d.get('fatcat_release') or None,
- d.get('updated') or datetime.datetime.now(),
- d.get('metadata') or None ,
- )
- for d in batch]
+ if k not in r:
+ r[k] = r["metadata"].get(k)
+ r["metadata"].pop(k, None)
+ r["metadata"] = json.dumps(r["metadata"], sort_keys=True)
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["key"],
+ d.get("grobid_version") or None,
+ d["status_code"],
+ d["status"],
+ d.get("fatcat_release") or None,
+ d.get("updated") or now,
+ d.get("metadata") or None,
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_pdf_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_pdf_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
"""
@@ -239,16 +315,17 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d.to_sql_tuple() for d in batch]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_html_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_html_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
"""
@@ -274,16 +351,20 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d.to_sql_tuple() for d in batch]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_pdftrio(self, cur, batch, on_conflict="nothing"):
+ def insert_pdftrio(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
pdftrio (sha1hex, updated, status_code, status, pdftrio_version,
@@ -309,29 +390,36 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [
+ now = datetime.datetime.now()
+ rows = [
(
- d['key'],
- d.get('updated') or datetime.datetime.now(),
- d['status_code'],
- d['status'],
- d.get('versions', {}).get('pdftrio_version') or None,
- d.get('versions', {}).get('models_date') or None,
- d.get('ensemble_score'),
- d.get('bert_score'),
- d.get('linear_score'),
- d.get('image_score'),
+ d["key"],
+ d.get("updated") or now,
+ d["status_code"],
+ d["status"],
+ d.get("versions", {}).get("pdftrio_version") or None,
+ d.get("versions", {}).get("models_date") or None,
+ d.get("ensemble_score"),
+ d.get("bert_score"),
+ d.get("linear_score"),
+ d.get("image_score"),
)
- for d in batch]
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_ingest_request(self, cur, batch, on_conflict="nothing"):
+ def insert_ingest_request(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
@@ -345,35 +433,43 @@ class SandcrawlerPostgresClient:
sql += " RETURNING xmax;"
for r in batch:
# in case these fields were already packed into 'request'
- extra = r.get('request', {})
- for k in ('ext_ids', 'fatcat_release', 'edit_extra', 'rel'):
+ extra = r.get("request", {})
+ for k in ("ext_ids", "fatcat_release", "edit_extra", "rel"):
if r.get(k):
extra[k] = r[k]
if extra:
- r['extra'] = json.dumps(extra, sort_keys=True)
- batch = [(d['link_source'],
- d['link_source_id'],
- d['ingest_type'],
- d['base_url'],
- d.get('ingest_request_source'),
- d.get('release_stage') or None,
- d.get('extra') or None,
- )
- for d in batch]
+ r["extra"] = json.dumps(extra, sort_keys=True)
+ rows = [
+ (
+ d["link_source"],
+ d["link_source_id"],
+ d["ingest_type"],
+ d["base_url"],
+ d.get("ingest_request_source"),
+ d.get("release_stage") or None,
+ d.get("extra") or None,
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (link_source, link_source_id, ingest_type, base_url)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1], b[2], b[3])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1], b[2], b[3])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_ingest_file_result(self, cur, batch, on_conflict="nothing"):
+ def insert_ingest_file_result(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
VALUES %s
- ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
+ ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
"""
if on_conflict.lower() == "nothing":
sql += " NOTHING"
@@ -390,20 +486,165 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [(d['ingest_type'],
- d['base_url'],
- bool(d['hit']),
- d['status'],
- d.get('terminal_url'),
- d.get('terminal_dt'),
- d.get('terminal_status_code'),
- d.get('terminal_sha1hex'),
- )
- for d in batch]
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("terminal_url"),
+ d.get("terminal_dt"),
+ d.get("terminal_status_code"),
+ d.get("terminal_sha1hex"),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (ingest_type, base_url)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_ingest_fileset_platform(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ ingest_fileset_platform (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest)
+ VALUES %s
+ ON CONFLICT ON CONSTRAINT ingest_fileset_platform_pkeypkey DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=now(),
+ hit=EXCLUDED.hit,
+ status=EXCLUDED.status,
+ platform_name=EXCLUDED.platform_name,
+ platform_domain=EXCLUDED.platform_domain,
+ platform_id=EXCLUDED.platform_id,
+ ingest_strategy=EXCLUDED.ingest_strategy,
+ total_size=EXCLUDED.total_size,
+ file_count=EXCLUDED.file_count,
+ archiveorg_item_name=EXCLUDED.archiveorg_item_name,
+ archiveorg_item_bundle_path=EXCLUDED.archiveorg_item_bundle_path,
+ web_bundle_url=EXCLUDED.web_bundle_url,
+ web_bundle_dt=EXCLUDED.web_bundle_dt,
+ manifest=EXCLUDED.manifest
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("platform_name"),
+ d.get("platform_domain"),
+ d.get("platform_id"),
+ d.get("ingest_strategy"),
+ d.get("total_size"),
+ d.get("file_count"),
+ d.get("archiveorg_item_name"),
+ d.get("archiveorg_item_bundle_path"),
+ d.get("web_bundle_url"),
+ d.get("web_bundle_dt"),
+ d.get("manifest"),
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (ingest_type, base_url)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_crossref(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ crossref (doi, indexed, record)
+ VALUES %s
+ ON CONFLICT (doi) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ indexed=EXCLUDED.indexed,
+ record=EXCLUDED.record
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["doi"],
+ d.get("indexed") or None,
+ json.dumps(d["record"], sort_keys=True),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (doi)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_grobid_refs(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ grobid_refs (source, source_id, source_ts, updated, refs_json)
+ VALUES %s
+ ON CONFLICT (source, source_id) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ source_ts=EXCLUDED.source_ts,
+ updated=EXCLUDED.updated,
+ refs_json=EXCLUDED.refs_json
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["source"],
+ d["source_id"],
+ d.get("source_ts") or None,
+ d.get("updated") or now,
+ json.dumps(d["refs_json"], sort_keys=True),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (source, source_id)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
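The insert helpers above all follow the same pattern: normalize dicts into tuples, de-duplicate within the batch on the table's conflict key, then hand the rows to execute_values. A minimal standalone sketch of that pattern, against a hypothetical example_table that is not part of this schema:

import json
from typing import Any, Dict, List

import psycopg2.extensions
import psycopg2.extras


def insert_example(cur: psycopg2.extensions.cursor, batch: List[Dict[str, Any]]) -> int:
    sql = """
        INSERT INTO example_table (doi, record)
        VALUES %s
        ON CONFLICT (doi) DO NOTHING
        RETURNING xmax;
    """
    rows = [(d["doi"].lower(), json.dumps(d["record"], sort_keys=True)) for d in batch]
    # de-duplicate within the batch on the conflict key, so a single statement
    # never tries to upsert the same key twice
    row_dict = dict()
    for r in rows:
        row_dict[r[0]] = r
    rows = list(row_dict.values())
    resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
    # xmax == 0 marks freshly inserted rows (same check as _inserts_and_updates())
    return len([r for r in resp if int(r[0]) == 0])
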
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..5c13318
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,832 @@
+import urllib.parse
+from typing import Optional, Tuple
+
+import internetarchive
+
+from sandcrawler.fileset_types import (
+ FilesetManifestFile,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.misc import requests_retry_session
+
+
+class FilesetPlatformHelper:
+ def __init__(self):
+ self.platform_name = "unknown"
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ """
+ Does this request look like it matches this platform?
+ """
+ raise NotImplementedError()
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+ raise NotImplementedError()
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ assert item.manifest
+ total_size = sum([m.size or 0 for m in item.manifest]) or 0
+ largest_size = max([m.size or 0 for m in item.manifest]) or 0
+ if len(item.manifest) == 1:
+ if total_size < 64 * 1024 * 1024:
+ return IngestStrategy.WebFile
+ else:
+ return IngestStrategy.ArchiveorgFile
+ else:
+ if largest_size < 64 * 1024 * 1024 and total_size < 128 * 1024 * 1024 * 1024:
+ return IngestStrategy.WebFileset
+ else:
+ return IngestStrategy.ArchiveorgFileset
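
A hedged sketch of how these default thresholds (64 MiB per file, 128 GiB total) classify two hypothetical manifests:

from sandcrawler.fileset_types import FilesetManifestFile, FilesetPlatformItem, IngestStrategy

# hypothetical manifests, sizes in bytes
small_single = FilesetPlatformItem(
    platform_name="example",
    platform_status="success",
    manifest=[FilesetManifestFile(path="data.csv", size=10 * 1024 * 1024)],
)
large_multi = FilesetPlatformItem(
    platform_name="example",
    platform_status="success",
    manifest=[
        FilesetManifestFile(path="part1.zip", size=80 * 1024 * 1024),
        FilesetManifestFile(path="part2.zip", size=80 * 1024 * 1024),
    ],
)

helper = FilesetPlatformHelper()
assert helper.chose_strategy(small_single) == IngestStrategy.WebFile
assert helper.chose_strategy(large_multi) == IngestStrategy.ArchiveorgFileset  # one file over 64 MiB
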
+
+
+class DataverseHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "dataverse"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_dataverse_persistentid(pid: str) -> dict:
+ """
+ Parses a persistentId into 5 sections:
+
+ - type (doi or hdl)
+ - authority (eg, DOI prefix)
+ - shoulder (optional, eg 'DVN')
+ - dataset_id (6-character)
+ - file_id
+
+ The returned dict always has all components, which may be 'None' if optional.
+
+ This is possible because the dataverse software only supports a handful
+ of configurations and persistent identifier types.
+
+ If there is an error parsing, raises a ValueError
+ """
+ id_type = None
+ if pid.startswith("doi:10."):
+ id_type = "doi"
+ pid = pid[4:]
+ elif pid.startswith("hdl:"):
+ id_type = "hdl"
+ pid = pid[4:]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ comp = pid.split("/")
+ if len(comp) < 2:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ authority = comp[0]
+ shoulder = None
+ dataset_id = None
+ file_id = None
+ if len(comp[1]) != 6 and len(comp) == 3:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ elif len(comp[1]) != 6 and len(comp) == 4:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ file_id = comp[3]
+ elif len(comp[1]) == 6 and len(comp) == 2:
+ dataset_id = comp[1]
+ elif len(comp[1]) == 6 and len(comp) == 3:
+ dataset_id = comp[1]
+ file_id = comp[2]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ if len(dataset_id) != 6:
+ raise ValueError(f"expected a 6-character dataverse dataset id: {dataset_id}")
+ if file_id and len(file_id) != 6:
+ raise ValueError(f"expected a 6-character dataverse file id: {file_id}")
+
+ return {
+ "type": id_type,
+ "authority": authority,
+ "shoulder": shoulder,
+ "dataset_id": dataset_id,
+ "file_id": file_id,
+ }
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: could also do HTML platform detection or something?
+
+ components = urllib.parse.urlparse(url)
+ # platform_domain = components.netloc.split(':')[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not id_param:
+ return False
+ platform_id = id_param[0]
+
+ try:
+ self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ return False
+
+ return True
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+
+
+ HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not (id_param and id_param[0]):
+ raise PlatformScopeError("Expected a Dataverse persistentId in URL")
+ platform_id = id_param[0]
+ version_param = params.get("version")
+ dataset_version = None
+ if version_param:
+ dataset_version = version_param[0]
+
+ try:
+ parsed_id = self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ raise PlatformScopeError("not actually in scope")
+
+ if parsed_id["file_id"]:
+ # TODO: maybe we could support this?
+ raise PlatformScopeError(
+ "only entire dataverse datasets can be archived with this tool"
+ )
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ if not dataset_version:
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+ if "latestVersion" not in obj["data"]:
+ raise PlatformScopeError("could not find latest version for dataverse record")
+ obj_latest = obj["data"]["latestVersion"]
+ dataset_version = (
+ f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ obj_latest = obj["data"]["latestVersion"]
+ assert (
+ dataset_version
+ == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+ assert platform_id == obj_latest["datasetPersistentId"]
+
+ manifest = []
+ for row in obj_latest["files"]:
+ df = row["dataFile"]
+ df_persistent_id = df["persistentId"]
+ platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}"
+ if df.get("originalFileName"):
+ platform_url += "&format=original"
+
+ extra = dict()
+ # TODO: always save the version field?
+ if row.get("version") != 1:
+ extra["version"] = row["version"]
+ if "description" in df:
+ extra["description"] = df["description"]
+ manifest.append(
+ FilesetManifestFile(
+ path=df.get("originalFileName") or df["filename"],
+ size=df.get("originalFileSize") or df["filesize"],
+ md5=df["md5"],
+ # NOTE: don't get: sha1, sha256
+ mimetype=df["contentType"],
+ platform_url=platform_url,
+ extra=extra or None,
+ )
+ )
+
+ platform_sub_id = platform_id.split("/")[-1]
+ archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ date=obj_latest["releaseTime"].split("T")[0],
+ source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
+ )
+ if platform_id.startswith("doi:10."):
+ archiveorg_item_meta["doi"] = platform_id.replace("doi:", "")
+ for block in obj_latest["metadataBlocks"]["citation"]["fields"]:
+ if block["typeName"] == "title":
+ archiveorg_item_meta["title"] = block["value"]
+ elif block["typeName"] == "depositor":
+ archiveorg_item_meta["creator"] = block["value"]
+ elif block["typeName"] == "dsDescription":
+ archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][
+ "value"
+ ]
+
+ archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "")
+ if obj_latest.get("termsOfUse"):
+ archiveorg_item_meta["description"] += "\n<br>\n" + obj_latest["termsOfUse"]
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
+
+def test_parse_dataverse_persistentid() -> None:
+
+ valid = {
+ "doi:10.25625/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.25625",
+ "shoulder": None,
+ "dataset_id": "LL6WXZ",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": "LL6WXZ",
+ },
+ "hdl:20.500.12690/RIN/IDDOAH/BTNH25": {
+ "type": "hdl",
+ "authority": "20.500.12690",
+ "shoulder": "RIN",
+ "dataset_id": "IDDOAH",
+ "file_id": "BTNH25",
+ },
+ "doi:10.7910/DVN/6HPRIG": {
+ "type": "doi",
+ "authority": "10.7910",
+ "shoulder": "DVN",
+ "dataset_id": "6HPRIG",
+ "file_id": None,
+ },
+ }
+
+ invalid = [
+ # "doi:10.5072/FK2/J8SJZB/LL6WXZ",
+ "doi:10.25625/abcd",
+ "other:10.25625/LL6WXZ",
+ "10.25625/LL6WXZ",
+ "doi:10.5072/FK2/J8SJZB/LL6WXZv123",
+ ]
+
+ for pid, val in valid.items():
+ assert DataverseHelper.parse_dataverse_persistentid(pid) == val
+
+ for pid in invalid:
+ try:
+ DataverseHelper.parse_dataverse_persistentid(pid)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class FigshareHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "figshare"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_figshare_url_path(path: str) -> Tuple[str, Optional[str]]:
+ """
+ Tries to parse a figshare URL into ID number and (optional) version number.
+
+ Returns a two-element tuple; version number will be None if not found
+
+ Raises a ValueError if not a figshare URL
+ """
+ # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
+ # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
+
+ comp = path.split("/")
+ if len(comp) < 4 or comp[1] != "articles":
+ raise ValueError(f"not a figshare URL: {path}")
+
+ comp = comp[2:]
+ if comp[0] in [
+ "dataset",
+ # TODO: should the following be considered "out of scope"?
+ "journal_contribution",
+ "presentation",
+ "poster",
+ "thesis",
+ ]:
+ comp = comp[1:]
+
+ if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit():
+ return (comp[1], comp[2])
+ elif len(comp) == 2 and comp[1].isdigit():
+ return (comp[1], None)
+ else:
+ raise ValueError(f"couldn't find figshare identiier: {path}")
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ # only work with full, versioned figshare.com URLs
+ if "figshare.com" not in platform_domain:
+ return False
+
+ try:
+ parsed = self.parse_figshare_url_path(components.path)
+ except ValueError:
+ return False
+
+ # has file component
+ if parsed[0] and parsed[1]:
+ return True
+
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+ assert (
+ dataset_version and dataset_version.isdigit()
+ ), f"expected numeric: {dataset_version}"
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ # TODO: implement this code path
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ # figshare_type = obj['defined_type_name']
+
+ if not obj["is_public"]:
+ raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}")
+ if obj["is_embargoed"]:
+ raise PlatformRestrictedError(
+ f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})'
+ )
+
+ manifest = []
+ for row in obj["files"]:
+ manifest.append(
+ FilesetManifestFile(
+ path=row["name"],
+ size=row["size"],
+ md5=row["computed_md5"],
+ # NOTE: don't get: sha1, sha256, mimetype
+ platform_url=row["download_url"],
+ # extra=dict(),
+ )
+ )
+ if row.get("is_link_only"):
+ raise PlatformScopeError(
+ f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+ )
+
+ authors = []
+ for author in obj["authors"]:
+ authors.append(author["full_name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["title"],
+ date=obj["published_date"],
+ source=obj["url_public_html"],
+ description=obj["description"],
+ license=obj["license"]["url"],
+ version=obj["version"],
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
+
+def test_parse_figshare_url_path() -> None:
+
+ valid = {
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": (
+ "8987858",
+ "1",
+ ),
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": (
+ "8987858",
+ None,
+ ),
+ "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+ "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": (
+ "12127176",
+ "4",
+ ),
+ "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+ "1407386",
+ "3",
+ ),
+ "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+ "1094056",
+ "1",
+ ),
+ }
+
+ invalid = [
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species",
+ ]
+
+ for path, val in valid.items():
+ assert FigshareHelper.parse_figshare_url_path(path) == val
+
+ for path in invalid:
+ try:
+ FigshareHelper.parse_figshare_url_path(path)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class ZenodoHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "zenodo"
+ self.session = requests_retry_session()
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ if platform_domain == "zenodo.org" and "/record/" in components.path:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: also look in base_url and resource-non-terminal for ident? to
+ # check for work-level redirects
+
+ # 1. extract identifier from URL
+ # eg: https://zenodo.org/record/5230255
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ if len(components.path.split("/")) < 2:
+ raise PlatformScopeError("Expected a complete, versioned figshare URL")
+
+ platform_id = components.path.split("/")[2]
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+
+ if "zenodo.org" not in platform_domain:
+ raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
+
+ # 2. API fetch
+ resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}", timeout=60.0)
+ if resp.status_code == 410:
+ raise PlatformRestrictedError("record deleted")
+ resp.raise_for_status()
+ obj = resp.json()
+
+ assert obj["id"] == int(platform_id)
+ work_id = obj["conceptrecid"]
+ if work_id == obj["id"]:
+ raise PlatformScopeError(
+ "got a work-level zenodo record, not a versioned record: {work_id}"
+ )
+
+ # zenodo_type = obj['metadata']['resource_type']['type']
+
+ if obj["metadata"]["access_right"] != "open":
+ raise PlatformRestrictedError(
+ "not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}"
+ )
+
+ manifest = []
+ for row in obj["files"]:
+ mf = FilesetManifestFile(
+ path=row["key"],
+ size=row["size"],
+ platform_url=row["links"]["self"],
+ # extra=dict(),
+ )
+ checksum = row["checksum"]
+ # eg: md5:35ffcab905f8224556dba76648cb7dad
+ if checksum.startswith("md5:"):
+ mf.md5 = checksum[4:]
+ elif checksum.startswith("sha1:"):
+ mf.sha1 = checksum[5:]
+ manifest.append(mf)
+
+ authors = []
+ for author in obj["metadata"]["creators"]:
+ authors.append(author["name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["metadata"]["title"],
+ date=obj["metadata"]["publication_date"],
+ source=obj["links"]["html"],
+ description=obj["metadata"]["description"],
+ license=obj["metadata"]["license"]["id"],
+ version=obj["revision"],
+ # obj['metadata']['version'] is, eg, git version tag
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=obj["revision"]),
+ )
+
+
+class ArchiveOrgHelper(FilesetPlatformHelper):
+
+ FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+ }
+
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "archiveorg"
+ self.session = internetarchive.get_session()
+
+ @staticmethod
+ def want_item_file(f: internetarchive.File, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in [
+ "_academictorrents.torrent",
+ "_academictorrents_torrent.txt",
+ ".bib",
+ ]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+ patterns = [
+ "://archive.org/details/",
+ "://archive.org/download/",
+ ]
+ for p in patterns:
+ if p in url:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ base_url_split = request["base_url"].split("/")
+ # print(base_url_split, file=sys.stderr)
+ assert len(base_url_split) in [5, 6]
+ assert base_url_split[0] in ["http:", "https:"]
+ assert base_url_split[2] == "archive.org"
+ assert base_url_split[3] in ["details", "download"]
+ item_name = base_url_split[4]
+ if len(base_url_split) == 6 and base_url_split[5]:
+ raise PlatformScopeError(
+ "got an archive.org file path, not download/details page; individual files not handled yet"
+ )
+
+ # print(f" archiveorg processing item={item_name}", file=sys.stderr)
+ item = self.session.get_item(item_name)
+ item_name = item.identifier
+ item_collection = item.metadata["collection"]
+ if type(item_collection) == list:
+ item_collection = item_collection[0]
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+ manifest = []
+ for f in item_files:
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = FilesetManifestFile(
+ path=f.name,
+ size=int(f.size),
+ sha1=f.sha1,
+ md5=f.md5,
+ mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+ platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+ )
+ manifest.append(mf)
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain="archive.org",
+ platform_id=item_name,
+ archiveorg_item_name=item_name,
+ archiveorg_item_meta=dict(collection=item_collection),
+ )
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ """
+ Don't use default strategy picker; we are always doing an 'existing' in this case.
+ """
+ assert item.manifest is not None
+ if len(item.manifest) == 1:
+ # NOTE: code flow does not support ArchiveorgFilesetBundle for the
+ # case of, eg, a single zipfile in an archive.org item
+ return IngestStrategy.ArchiveorgFile
+ elif len(item.manifest) >= 1:
+ return IngestStrategy.ArchiveorgFileset
+ else:
+ raise NotImplementedError("empty dataset")
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
new file mode 100644
index 0000000..1d84ce5
--- /dev/null
+++ b/python/sandcrawler/fileset_strategies.py
@@ -0,0 +1,387 @@
+import os
+import shutil
+import sys
+from typing import Optional
+
+import internetarchive
+import requests
+
+from sandcrawler.fileset_types import (
+ ArchiveStrategyResult,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformScopeError,
+)
+from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
+from sandcrawler.misc import (
+ gen_file_metadata,
+ gen_file_metadata_path,
+ requests_retry_session,
+ sanitize_fs_path,
+)
+
+
+class FilesetIngestStrategy:
+ def __init__(self):
+ # self.ingest_strategy = 'unknown'
+ self.success_status = "success"
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ raise NotImplementedError()
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ raise NotImplementedError()
+
+
+class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+
+ # TODO: enable cleanup when confident (eg, safe path parsing)
+ self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
+ self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
+ try:
+ os.mkdir(self.working_dir)
+ except FileExistsError:
+ pass
+
+ self.http_session = requests_retry_session()
+ self.ia_session = internetarchive.get_session(
+ config={
+ "s3": {
+ "access": os.environ.get("IA_ACCESS_KEY"),
+ "secret": os.environ.get("IA_SECRET_KEY"),
+ },
+ }
+ )
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ """
+ use API to check for item with all the files in the manifest
+
+ NOTE: this naive comparison is quadratic in number of files, aka O(N^2)
+ """
+ ia_item = self.ia_session.get_item(item.archiveorg_item_name)
+ if not ia_item.exists:
+ return None
+ item_files = ia_item.get_files(on_the_fly=False)
+ assert item.manifest
+ for wanted in item.manifest:
+ found = False
+ for existing in item_files:
+ if existing.name == wanted.path:
+ if (
+ (
+ (existing.sha1 and existing.sha1 == wanted.sha1)
+ or (existing.md5 and existing.md5 == wanted.md5)
+ )
+ and existing.name == wanted.path
+ and existing.size == wanted.size
+ ):
+ found = True
+ wanted.status = "exists"
+ break
+ else:
+ wanted.status = "mismatch-existing"
+ break
+ if not found:
+ print(
+ f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}",
+ file=sys.stderr,
+ )
+ return None
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status="success-existing",
+ manifest=item.manifest,
+ )
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ May require extra context to pass along to archive.org item creation.
+ """
+ existing = self.check_existing(item)
+ if existing:
+ return existing
+
+ if item.platform_name == "archiveorg":
+ raise PlatformScopeError("shouldn't download archive.org into itself")
+
+ local_dir = self.working_dir + item.archiveorg_item_name
+ assert local_dir.startswith("/")
+ assert local_dir.count("/") > 2
+ try:
+ os.mkdir(local_dir)
+ except FileExistsError:
+ pass
+
+ # 1. download all files locally
+ assert item.manifest
+ for m in item.manifest:
+ if m.path != sanitize_fs_path(m.path):
+ m.status = "unsafe-path"
+ continue
+
+ local_path = local_dir + "/" + m.path
+ assert m.platform_url
+
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ if os.path.exists(local_path):
+ m.status = "exists-local"
+ else:
+ print(f" downloading {m.path}", file=sys.stderr)
+ # create any sub-directories for this path, if necessary
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ try:
+ with self.http_session.get(
+ m.platform_url,
+ stream=True,
+ allow_redirects=True,
+ timeout=2 * 60 * 60,
+ ) as r:
+ r.raise_for_status()
+ with open(local_path + ".partial", "wb") as f:
+ for chunk in r.iter_content(chunk_size=256 * 1024):
+ f.write(chunk)
+ os.rename(local_path + ".partial", local_path)
+ m.status = "downloaded-local"
+ except requests.exceptions.RequestException:
+ m.status = "error-platform-download"
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-platform-download",
+ )
+
+ print(f" verifying {m.path}", file=sys.stderr)
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ if file_meta["size_bytes"] != m.size:
+ print(f" expected: {m.size} found: {file_meta['size_bytes']}", file=sys.stderr)
+ m.status = "mismatch-size"
+ continue
+
+ if m.sha1:
+ if file_meta["sha1hex"] != m.sha1:
+ m.status = "mismatch-sha1"
+ continue
+ else:
+ m.sha1 = file_meta["sha1hex"]
+
+ if m.sha256:
+ if file_meta["sha256hex"] != m.sha256:
+ m.status = "mismatch-sha256"
+ continue
+ else:
+ m.sha256 = file_meta["sha256hex"]
+
+ if m.md5:
+ if file_meta["md5hex"] != m.md5:
+ m.status = "mismatch-md5"
+ continue
+ else:
+ m.md5 = file_meta["md5hex"]
+
+ if m.mimetype:
+ # 'magic' isn't good at detecting more detailed text file formats like text/csv
+ if (
+ file_meta["mimetype"] != m.mimetype
+ and file_meta["mimetype"] != "text/plain"
+ ):
+ # these 'tab-separated-values' from dataverse are just noise, don't log them
+ if m.mimetype != "text/tab-separated-values":
+ print(
+ f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}",
+ file=sys.stderr,
+ )
+ m.mimetype = file_meta["mimetype"]
+ else:
+ m.mimetype = file_meta["mimetype"]
+ m.status = "verified-local"
+
+ # if verification failed for any individual files, bail out
+ for m in item.manifest:
+ if m.status != "verified-local":
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status=m.status,
+ )
+
+ # 2. upload all files, with metadata
+ assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
+ item_files = {}
+ for m in item.manifest:
+ local_path = local_dir + "/" + m.path
+ if m.path == "name":
+ raise NotImplementedError(
+ "fileset file path is 'name', which is a reserved keyword"
+ )
+ item_files[m.path] = local_path
+ if len(item_files) != len(item.manifest):
+ raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
+
+ print(
+ f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
+ file=sys.stderr,
+ )
+ try:
+ internetarchive.upload(
+ item.archiveorg_item_name,
+ files=item_files,
+ metadata=item.archiveorg_item_meta,
+ checksum=True,
+ queue_derive=False,
+ verify=True,
+ )
+ except requests.exceptions.RequestException:
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-archiveorg-upload",
+ )
+
+ for m in item.manifest:
+ m.status = "success"
+
+ # 4. delete local directory
+ if not self.skip_cleanup_local_files:
+ shutil.rmtree(local_dir)
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=self.success_status,
+ manifest=item.manifest,
+ )
+
+ return result
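
The strategy above pulls its archive.org credentials and scratch directory from the environment; a hedged sketch of the wiring (the key values are placeholders):

import os

os.environ.setdefault("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
os.environ.setdefault("IA_ACCESS_KEY", "...")  # archive.org S3-style API credentials
os.environ.setdefault("IA_SECRET_KEY", "...")

strategy = ArchiveorgFilesetStrategy(skip_cleanup_local_files=True)
# strategy.process(item) downloads each manifest file locally, verifies sizes and
# checksums, then uploads everything to the archive.org item named by
# item.archiveorg_item_name
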
+
+
+class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
+ """
+ ArchiveorgFilesetStrategy currently works fine with individual files. Just
+ need to over-ride the ingest_strategy name.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFile
+ self.success_status = "success-file"
+
+
+class WebFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.WebFileset
+ self.wayback_client = WaybackClient()
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+ self.max_spn_manifest = 20
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
+
+ TODO:
+ - full fetch_resource() method which can do SPN requests
+ """
+
+ assert item.manifest
+ file_file_meta = None
+ file_resource = None
+ for m in item.manifest:
+ fetch_url = m.platform_url
+ if not fetch_url:
+ raise NotImplementedError(
+ "require 'platform_url' for each file when doing Web fetching"
+ )
+
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
+
+ if self.try_spn2 and (
+ resource is None or (resource and resource.status == "no-capture")
+ ):
+ if len(item.manifest) > self.max_spn_manifest:
+ m.status = "too-much-spn"
+ continue
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(
+ fetch_url, self.wayback_client, force_simple_get=True
+ )
+
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via,
+ (resource and resource.status),
+ (resource and resource.terminal_url) or fetch_url,
+ ),
+ file=sys.stderr,
+ )
+
+ m.terminal_url = resource.terminal_url
+ m.terminal_dt = resource.terminal_dt
+ m.status = resource.status
+ if self.ingest_strategy == "web-file":
+ file_resource = resource
+
+ if resource.status != "success":
+ continue
+ else:
+ assert resource.terminal_status_code == 200
+
+ if not resource.body:
+ m.status = "empty-blob"
+ continue
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, _html_resource = fix_transfer_encoding(file_meta, resource)
+ except Exception:
+ m.status = "transfer-encoding-error"
+ continue
+
+ if self.ingest_strategy == "web-file":
+ file_file_meta = file_meta
+
+ if (
+ file_meta["size_bytes"] != m.size
+ or (m.md5 and m.md5 != file_meta["md5hex"])
+ or (m.sha1 and m.sha1 != file_meta["sha1hex"])
+ ):
+ m.status = "mismatch"
+ continue
+
+ m.md5 = m.md5 or file_meta["md5hex"]
+ m.sha1 = m.sha1 or file_meta["sha1hex"]
+ m.sha256 = m.sha256 or file_meta["sha256hex"]
+ m.mimetype = m.mimetype or file_meta["mimetype"]
+
+ overall_status = self.success_status
+ for m in item.manifest:
+ if m.status != "success":
+ overall_status = m.status or "not-processed"
+ break
+ if not item.manifest:
+ overall_status = "empty-manifest"
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=overall_status,
+ manifest=item.manifest,
+ )
+ if self.ingest_strategy == "web-file":
+ result.file_file_meta = file_file_meta
+ result.file_resource = file_resource
+ return result
+
+
+class WebFileStrategy(WebFilesetStrategy):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.ingest_strategy = IngestStrategy.WebFile
+ self.success_status = "success-file"
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
new file mode 100644
index 0000000..3398833
--- /dev/null
+++ b/python/sandcrawler/fileset_types.py
@@ -0,0 +1,74 @@
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+
+class IngestStrategy(str, Enum):
+ WebFile = "web-file"
+ WebFileset = "web-fileset"
+ WebFilesetBundled = "web-fileset-bundled"
+ ArchiveorgFile = "archiveorg-file"
+ ArchiveorgFileset = "archiveorg-fileset"
+ ArchiveorgFilesetBundled = "archiveorg-fileset-bundled"
+
+
+class FilesetManifestFile(BaseModel):
+ path: str
+ size: Optional[int]
+ md5: Optional[str]
+ sha1: Optional[str]
+ sha256: Optional[str]
+ mimetype: Optional[str]
+ extra: Optional[Dict[str, Any]]
+
+ status: Optional[str]
+ platform_url: Optional[str]
+ terminal_url: Optional[str]
+ terminal_dt: Optional[str]
+
+
+class FilesetPlatformItem(BaseModel):
+ platform_name: str
+ platform_status: str
+ platform_domain: Optional[str]
+ platform_id: Optional[str]
+ manifest: Optional[List[FilesetManifestFile]]
+
+ archiveorg_item_name: Optional[str]
+ archiveorg_item_meta: Optional[dict]
+ web_base_url: Optional[str]
+ web_bundle_url: Optional[str]
+
+
+class ArchiveStrategyResult(BaseModel):
+ ingest_strategy: str
+ status: str
+ manifest: List[FilesetManifestFile]
+ file_file_meta: Optional[Dict[str, Any]]
+ file_resource: Optional[Any]
+ bundle_file_meta: Optional[Dict[str, Any]]
+ bundle_resource: Optional[Any]
+ bundle_archiveorg_path: Optional[str]
+
+
+class PlatformScopeError(Exception):
+ """
+ For incidents where the platform helper discovers that the fileset/dataset is
+ out-of-scope after it has already started processing it.
+
+ For example, attempting to ingest:
+
+ - a 'latest version' record, when the platform has version-specific records
+ - a single file within a dataset for a platform which has file-level identifiers
+ """
+
+ pass
+
+
+class PlatformRestrictedError(Exception):
+ """
+ When datasets are not publicly available on a platform (yet)
+ """
+
+ pass
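
A quick sketch of the pydantic types above in use (the values are made up):

from sandcrawler.fileset_types import (
    ArchiveStrategyResult,
    FilesetManifestFile,
    IngestStrategy,
)

mf = FilesetManifestFile(
    path="data/measurements.csv",
    size=123456,
    md5="35ffcab905f8224556dba76648cb7dad",
    mimetype="text/csv",
    status="verified-local",
)
result = ArchiveStrategyResult(
    ingest_strategy=IngestStrategy.WebFileset,
    status="success",
    manifest=[mf],
)
print(result.json())  # pydantic serialization, eg for persisting or publishing the result
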
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b010b2c..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,92 +1,301 @@
+import html
+import sys
+import time
+import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
import requests
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
-from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
-from .misc import gen_file_metadata
+from .ia import WaybackClient
+from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte
+
+
+def clean_crossref_unstructured(raw: str) -> str:
+ """
+ Applies Crossref-specific cleanups to an 'unstructured' citation string.
+ """
+
+ # detect repeated strings with double space separating them
+ subs = raw.split(" ")
+ if len(subs) == 2 and subs[0] == subs[1]:
+ raw = subs[0]
+ else:
+ raw = " ".join(subs)
+
+ # remove HTML/XML numeric characters
+ if "&#" in raw or "&amp;" in raw or "&gt;" in raw or "&lt;" in raw:
+ raw = html.unescape(raw)
+
+ raw.replace(" ", " ")
+ raw = raw.strip()
+ return raw
+
+
+def test_clean_ref_str() -> None:
+ # NOTE: this as emdash, non-breaking string characters in it
+ raw_with_nbsp = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394. Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ cleaned = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ assert clean_crossref_unstructured(raw_with_nbsp) == cleaned
+
+ # HTML escape characters
+ assert (
+ clean_crossref_unstructured(
+ "J-B Champion, C.Collin, INSEE Premi&#232;re N&#176;1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+ == "J-B Champion, C.Collin, INSEE Première N°1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+
+ # simple doubling
+ assert (
+ clean_crossref_unstructured("https://graph500.org/. https://graph500.org/.")
+ == "https://graph500.org/."
+ )
+ assert (
+ clean_crossref_unstructured(
+ """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg. Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+ == """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+
+ # all non-breaking whitespace
+ assert (
+ clean_crossref_unstructured(
+ "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+ )
+ == ""
+ )
-class GrobidClient(object):
- def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
+class GrobidClient(object):
+ def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
- self.consolidate_mode = int(kwargs.get('consolidate_mode', 2))
+ self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
+ self.session = requests_retry_session()
- def process_fulltext(self, blob, consolidate_mode=None):
+ def process_fulltext(
+ self, blob: bytes, consolidate_mode: Optional[int] = None
+ ) -> Dict[str, Any]:
"""
Returns dict with keys:
- status_code
- status (slug)
- error_msg (if status == 'error')
- tei_xml (if status is 200)
-
- TODO: persist connection for performance?
"""
assert blob
- if consolidate_mode == None:
+ if len(blob) > MAX_GROBID_BLOB_SIZE:
+ return {
+ "status": "blob-too-large",
+ "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+ }
+
+ if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
+ assert consolidate_mode is not None
try:
- grobid_response = requests.post(
+ grobid_response = self.session.post(
self.host_url + "/api/processFulltextDocument",
files={
- 'input': blob,
- 'consolidateHeader': self.consolidate_mode,
- 'consolidateCitations': 0, # too expensive for now
- 'includeRawCitations': 1,
+ "input": blob,
+ },
+ data={
+ "consolidateHeader": consolidate_mode,
+ "consolidateCitations": 0, # too expensive for now
+ "includeRawCitations": 1,
+ "includeRawAffiliations": 1,
+ "teiCoordinates": ["ref", "figure", "persName", "formula", "biblStruct"],
+ "segmentSentences": 1,
},
timeout=180.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'GROBID request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "GROBID request (HTTP POST) timeout",
}
+ except requests.exceptions.ConnectionError as ce:
+ # intentionally re-raise this, so workers crash when GROBID is
+ # unavailable; but add a sleep first to slow things down
+ print(
+ "GROBID ConnectionError. sleeping as a slow-down before crashing",
+ file=sys.stderr,
+ )
+ time.sleep(5.0)
+ raise ce
- info = dict(
- status_code=grobid_response.status_code,
- )
+ info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200:
- info['status'] = 'success'
- info['tei_xml'] = grobid_response.text
- if len(info['tei_xml']) > 12000000:
+ info["status"] = "success"
+ info["tei_xml"] = grobid_response.text
+ if len(info["tei_xml"]) > 12000000:
# XML is larger than Kafka message size, and much larger than
# an article in general; bail out
- info['status'] = 'error'
- info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
- info.pop('tei_xml')
+ info["status"] = "error"
+ info["error_msg"] = "response XML too large: {} bytes".format(
+ len(info["tei_xml"])
+ )
+ info.pop("tei_xml")
else:
# response.text is .content decoded as utf-8
- info['status'] = 'error'
- info['error_msg'] = grobid_response.text[:10000]
+ info["status"] = "error"
+ info["error_msg"] = grobid_response.text[:10000]
return info
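
A hedged usage sketch of the client above (the host URL and file name are assumptions):

client = GrobidClient(host_url="http://localhost:8070")  # hypothetical local GROBID instance
with open("paper.pdf", "rb") as f:  # hypothetical local PDF
    result = client.process_fulltext(f.read())
if result["status"] == "success":
    tei_xml = result["tei_xml"]  # TEI-XML document, size-capped above
else:
    print(result.get("error_msg"))
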
- def metadata(self, result):
- if result['status'] != 'success':
+ def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+ if not unstructured_list:
+ return []
+ if len(unstructured_list) > 5000:
+ raise ValueError("more than 5,000 references in a batch is just too much")
+
+ try:
+ grobid_response = self.session.post(
+ self.host_url + "/api/processCitationList",
+ data={
+ "citations": unstructured_list,
+ "consolidateCitations": 0,
+ "includeRawCitations": 1,
+ },
+ timeout=30.0,
+ )
+ except requests.Timeout as te:
+ # TODO: handle somehow?
+ raise te
+
+ grobid_response.raise_for_status()
+ return parse_citation_list_xml(grobid_response.text)
+
+ def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ if result["status"] != "success":
return None
- tei_json = teixml2json(result['tei_xml'], encumbered=False)
+ try:
+ tei_doc = parse_document_xml(result["tei_xml"])
+ except xml.etree.ElementTree.ParseError as pe:
+ result["status"] = "bad-grobid-xml"
+ return dict(error_msg=str(pe)[:1000])
+ tei_doc.remove_encumbered()
+ tei_json = tei_doc.to_legacy_dict()
meta = dict()
biblio = dict()
- for k in ('title', 'authors', 'journal', 'date', 'doi', ):
+ for k in (
+ "title",
+ "authors",
+ "journal",
+ "date",
+ "doi",
+ ):
if tei_json.get(k):
biblio[k] = tei_json[k]
- meta['biblio'] = biblio
- for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
+ meta["biblio"] = biblio
+ for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
if tei_json.get(k):
meta[k] = tei_json[k]
return meta
-class GrobidWorker(SandcrawlerFetchWorker):
+ def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+ """
+ Helper function to decide whether to run GROBID parsing on a Crossref
+ reference.
- def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
+ For example, parsing can be skipped if there is already a DOI in the ref
+ metadata, or if there is sufficient structured metadata; the decision could
+ also depend on the source of the DOI linkage.
+ """
+ if ref.get("DOI"):
+ return False
+ if len(ref.get("unstructured", "").strip()) <= 6:
+ return False
+
+ if (
+ ref.get("year")
+ and ref.get("author")
+ and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+ ):
+ return False
+ elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+ return False
+ elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+ return False
+
+ return True
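
For illustration, two hypothetical Crossref reference entries and how the filter above treats them:

client = GrobidClient()
ref_with_doi = {"key": "ref1", "DOI": "10.1234/zzzz", "unstructured": "Some citation text"}
ref_unstructured = {
    "key": "ref2",
    "unstructured": "J. Doe. 2019. An Example Paper. Example Conference Proceedings.",
}
assert not client.should_parse_crossref_ref(ref_with_doi)  # DOI already known; skip GROBID
assert client.should_parse_crossref_ref(ref_unstructured)  # unstructured only; worth parsing
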
+
+ def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Given a complete Crossref metadata record, extracts the 'unstructured'
+ references and runs them through GROBID citation parsing.
+
+ The returned dict matches the schema of the `grobid_refs` database table,
+ in dict form:
+
+ source: 'crossref'
+ source_id: doi, as lower-case string
+ source_ts: Crossref indexed timestamp, if available
+ ('updated' is not set)
+ refs_json: list of dicts
+ """
+
+ # remove API wrapper around record, if necessary
+ if "message" in record and "DOI" not in record:
+ record = record["message"]
+
+ ret = dict(
+ source="crossref",
+ source_id=record["DOI"].lower(),
+ source_ts=record["indexed"]["date-time"],
+ refs_json=[],
+ )
+ all_refs = record.get("reference", [])
+ unstructured_refs = []
+ for r in all_refs:
+ if not r.get("unstructured"):
+ continue
+ if not self.should_parse_crossref_ref(r):
+ continue
+ unstructured_refs.append(r)
+ if not unstructured_refs:
+ return ret
+
+ # some reasonable cap on length of refs per work
+ if len(unstructured_refs) > 2000:
+ print(
+ f"truncating very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+ file=sys.stderr,
+ )
+ unstructured_refs = unstructured_refs[:2000]
+
+ clean_refs = [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
+ refs = self.process_citation_list(clean_refs)
+
+ assert len(refs) == len(unstructured_refs)
+ refs_json = []
+ for i in range(len(refs)):
+ refs[i].id = unstructured_refs[i].get("key")
+ refs[i].index = None
+ refs_json.append(refs[i].to_dict())
+ ret["refs_json"] = refs_json
+ return ret
+
+
+class GrobidWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ grobid_client: GrobidClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
super().__init__(wayback_client=wayback_client)
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 2
+ self.consolidate_mode = 0
- def timeout_response(self, task):
- default_key = task['sha1hex']
+ def timeout_response(self, task: Any) -> Any:
+ default_key = task["sha1hex"]
return dict(
status="error-timeout",
error_msg="internal GROBID worker timeout",
@@ -94,37 +303,74 @@ class GrobidWorker(SandcrawlerFetchWorker):
key=default_key,
)
- def process(self, record, key=None):
- default_key = record['sha1hex']
-
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
fetch_result = self.fetch_blob(record)
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['source'] = record
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["source"] = record
+ result["key"] = result["file_meta"]["sha1hex"]
return result
+
+class CrossrefRefsWorker(SandcrawlerWorker):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.grobid_client = grobid_client
+ self.sink = sink
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ # handle the rare case of bad TEI-XML response
+ # eg: https://github.com/kermitt2/grobid/issues/848
+ try:
+ return self.grobid_client.crossref_refs(record)
+ except xml.etree.ElementTree.ParseError:
+ print(
+ f"GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+ file=sys.stderr,
+ )
+ # but add a small slow-down so we don't churn through these if
+ # GROBID is just misconfigured or something
+ time.sleep(3)
+ return None
+ except requests.exceptions.HTTPError:
+ print(f"GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+ except requests.exceptions.ReadTimeout:
+ print(f"GROBID HTTP timeout for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+
+
class GrobidBlobWorker(SandcrawlerWorker):
"""
This is sort of like GrobidWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, grobid_client, sink=None, **kwargs):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
super().__init__()
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 2
+ self.consolidate_mode = 0
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
return result
-
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 14561bf..207f067 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,18 +1,20 @@
-
+import json
import re
import sys
-import json
import urllib.parse
+from typing import Any, Dict
from bs4 import BeautifulSoup
-RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"')
+RESEARCHSQUARE_REGEX = re.compile(
+ r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"'
+)
IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
-def extract_fulltext_url(html_url, html_body):
+def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
"""
Takes an HTML document (and URL), assumed to be a landing page, and tries
to find a fulltext PDF url.
@@ -20,9 +22,9 @@ def extract_fulltext_url(html_url, html_body):
On error, or if fails to extract a URL, returns an empty dict.
"""
- host_prefix = '/'.join(html_url.split('/')[:3])
+ host_prefix = "/".join(html_url.split("/")[:3])
try:
- soup = BeautifulSoup(html_body, 'html.parser')
+ soup = BeautifulSoup(html_body, "html.parser")
except TypeError as te:
print(f"{te} (url={html_url})", file=sys.stderr)
return dict()
@@ -30,75 +32,75 @@ def extract_fulltext_url(html_url, html_body):
print(f"{ule} (url={html_url})", file=sys.stderr)
return dict()
- ### General Tricks ###
+ # ignoring most type checks on bs4 output in this function (which is partially deprecated)
+ meta: Any
+ url: Any
+ redirect: Any
- # highwire-style meta tag
- meta = soup.find('meta', attrs={"name":"citation_pdf_url"})
- if not meta:
- meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
- if not meta:
- meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"})
- if not meta:
- # researchgate does this; maybe others also?
- meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
- if not meta:
- meta = soup.find('meta', attrs={"name":"eprints.document_url"})
- # if tag is only partially populated
- if meta and not meta.get('content'):
- meta = None
- # wiley has a weird almost-blank page we don't want to loop on
- if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
- url = meta['content'].strip()
- if '://doi.org/' in url:
- print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
- elif url.startswith('/'):
- if host_prefix+url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
- elif url.startswith('http'):
- if url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=url, technique='citation_pdf_url')
- else:
- print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+ ### General Tricks ###
+ # note: most of these have migrated to the html_biblio code path
- meta = soup.find('meta', attrs={"name":"generator"})
+ meta = soup.find("meta", attrs={"name": "generator"})
meta_generator = None
- if meta and meta.get('content'):
- meta_generator = meta['content'].strip()
+ if meta and meta.get("content"):
+ meta_generator = meta["content"].strip()
### Publisher/Platform Specific ###
# research square (researchsquare.com)
- if 'researchsquare.com/article/' in html_url:
+ if "researchsquare.com/article/" in html_url:
# JSON in body with a field like:
# "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"
- m = RESEARCHSQUARE_REGEX.search(html_body.decode('utf-8'))
+ m = RESEARCHSQUARE_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(release_stage="manuscript", pdf_url=url, technique='publisher')
+ return dict(release_stage="manuscript", pdf_url=url, technique="publisher")
    # elsevier linking hub
# https://linkinghub.elsevier.com/retrieve/pii/S1569199319308975
- if '://linkinghub.elsevier.com/retrieve/pii/' in html_url:
+ if "://linkinghub.elsevier.com/retrieve/pii/" in html_url:
# <input type="hidden" name="redirectURL" value="http%3A%2F%2Fcysticfibrosisjournal.com%2Fretrieve%2Fpii%2FS1569199319308975" id="redirectURL"/>
redirect = soup.find("input", attrs={"name": "redirectURL"})
if redirect:
- url = redirect['value'].strip()
- if 'http' in url:
+ url = redirect["value"].strip()
+ if "http" in url:
url = urllib.parse.unquote(url)
                # drop any query parameters
- url = url.split('?via')[0]
+ url = url.split("?via")[0]
return dict(next_url=url, technique="elsevier-linkinghub")
+ # sciencedirect PDF URL extract
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670
+ if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"):
+ json_tag: Any = soup.find(
+ "script", attrs={"type": "application/json", "data-iso-key": "_0"}
+ )
+ url = None
+ if json_tag:
+ try:
+ json_text = json_tag.string
+ json_meta = json.loads(json_text)
+ pdf_meta = json_meta["article"]["pdfDownload"]["urlMetadata"]
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+ url = (
+ html_url
+ + pdf_meta["pdfExtension"]
+ + "?md5="
+ + pdf_meta["queryParams"]["md5"]
+ + "&pid="
+ + pdf_meta["queryParams"]["pid"]
+ )
+ except (KeyError, TypeError, json.JSONDecodeError):
+ pass
+ if url:
+ return dict(pdf_url=url, technique="sciencedirect-munge-json")
+
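(A worked example of the URL munging above, using the md5 and pid values from the example URL in the comment; the "pdfExtension" value is an assumption.)

html_url = "https://www.sciencedirect.com/science/article/pii/S0169204621000670"
pdf_meta = {
    "pdfExtension": "/pdfft",  # assumed value
    "queryParams": {
        "md5": "c4a83d06b334b627ded74cf9423bfa56",
        "pid": "1-s2.0-S0169204621000670-main.pdf",
    },
}
url = (
    html_url
    + pdf_meta["pdfExtension"]
    + "?md5=" + pdf_meta["queryParams"]["md5"]
    + "&pid=" + pdf_meta["queryParams"]["pid"]
)
# -> ".../pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf"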
# sciencedirect PDF bounce page
# https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
- if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"):
+ if "://www.sciencedirect.com/" in html_url and html_url.endswith(".pdf"):
# window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=[...]&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=[...]&hash=[...]&host=[...]&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=[...]&type=client';
- m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode('utf-8'))
+ m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4000
@@ -106,85 +108,108 @@ def extract_fulltext_url(html_url, html_body):
# ieeexplore.ieee.org
# https://ieeexplore.ieee.org/document/8730316
- if '://ieeexplore.ieee.org/document/' in html_url:
+ if "://ieeexplore.ieee.org/document/" in html_url:
# JSON in body with a field like:
# "pdfPath":"/iel7/6287639/8600701/08730316.pdf",
- m = IEEEXPLORE_REGEX.search(html_body.decode('utf-8'))
+ m = IEEEXPLORE_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(release_stage="published", pdf_url=host_prefix+url, technique="ieeexplore")
+ return dict(
+ release_stage="published", pdf_url=host_prefix + url, technique="ieeexplore"
+ )
# https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313
- if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url:
+ if "://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber" in html_url:
# HTML iframe like:
# <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&amp;arnumber=8730313&amp;isnumber=8600701&amp;ref=" frameborder="0"></iframe>
- iframe = soup.find("iframe")
- if iframe and '.pdf' in iframe['src']:
- return dict(pdf_url=iframe['src'], technique="iframe")
+ iframe: Any = soup.find("iframe")
+ if iframe and ".pdf" in iframe["src"]:
+ return dict(pdf_url=iframe["src"], technique="iframe")
# https://insights.ovid.com/crossref?an=00042307-202001000-00013
# Ovid is some kind of landing page bounce portal tracking run-around.
# Can extract actual journal URL from javascript blob in the HTML
- if '://insights.ovid.com/crossref' in html_url:
+ if "://insights.ovid.com/crossref" in html_url:
# var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
- m = OVID_JOURNAL_URL_REGEX.search(html_body.decode('utf-8'))
+ m = OVID_JOURNAL_URL_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(next_url=url, technique='ovid')
+ return dict(next_url=url, technique="ovid")
# osf.io
# https://osf.io/8phvx/
# https://osf.io/preprints/socarxiv/8phvx/
# wow, they ship total javascript crud! going to just guess download URL
# based on URL for now. Maybe content type header would help?
- if '://osf.io/' in html_url and not '/download' in html_url:
- if not html_url.endswith("/"):
- next_url = html_url+"/download"
- else:
- next_url = html_url+"download"
- return dict(next_url=next_url, technique='osf-by-url')
+ OSF_DOMAINS = [
+ "://osf.io/",
+ "://biohackrxiv.org/",
+ "://psyarxiv.com/",
+ "://arabixiv.org/",
+ "://engrxiv.org/",
+ "://edarxiv.org//",
+ "://ecsarxiv.org/",
+ "://ecoevorxiv.org/",
+ "://frenxiv.org/",
+ "://indiarxiv.org/",
+ "://mindrxiv.org/",
+ "://mediarxiv.org/",
+ "://paleorxiv.org/",
+ "://thesiscommons.org/",
+ ]
+ for domain in OSF_DOMAINS:
+ if (
+ domain in html_url
+ and (len(html_url.split("/")) in [4, 5] or "/preprints/" in html_url)
+ and "/download" not in html_url
+ ):
+ if not html_url.endswith("/"):
+ next_url = html_url + "/download"
+ else:
+ next_url = html_url + "download"
+ return dict(next_url=next_url, technique="osf-by-url")
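(A worked example of the OSF-style guess above, using the osf.io URL from the comment.)

html_url = "https://osf.io/8phvx/"
# len(html_url.split("/")) == 5, so the path-length check passes
if not html_url.endswith("/"):
    next_url = html_url + "/download"
else:
    next_url = html_url + "download"
# next_url -> "https://osf.io/8phvx/download"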
# wiley
# https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
if "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
if b"/doi/pdfdirect/" in html_body:
- next_url = html_url.replace('/doi/pdf/', '/doi/pdfdirect/')
- return dict(next_url=next_url, technique='wiley-pdfdirect')
+ next_url = html_url.replace("/doi/pdf/", "/doi/pdfdirect/")
+ return dict(next_url=next_url, technique="wiley-pdfdirect")
# arxiv abstract pages
if "://arxiv.org/abs/" in html_url:
url = html_url.replace("/abs/", "/pdf/")
- return dict(pdf_url=url, technique='arxiv-url')
+ return dict(pdf_url=url, technique="arxiv-url")
# american archivist (OA)
# https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
- if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+ if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
# use a more aggressive direct guess to avoid rate-limiting...
if "/doi/10." in html_url:
url = html_url.replace("/doi/10.", "/doi/pdf/10.")
- return dict(pdf_url=url, technique='archivist-url')
+ return dict(pdf_url=url, technique="archivist-url")
# <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
- hrefs = soup.find_all('a', attrs={"target":"_blank"})
+ hrefs = soup.find_all("a", attrs={"target": "_blank"})
for href in hrefs:
- url = href['href'].strip()
+ url = href["href"].strip()
if "/doi/pdf/" in url:
- if url.startswith('http'):
- return dict(pdf_url=url, technique='publisher-href')
- elif url.startswith('/'):
- return dict(pdf_url=host_prefix+url, technique='publisher-href')
+ if url.startswith("http"):
+ return dict(pdf_url=url, technique="publisher-href")
+ elif url.startswith("/"):
+ return dict(pdf_url=host_prefix + url, technique="publisher-href")
# protocols.io
# https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6
if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
url = html_url + ".pdf"
- return dict(pdf_url=url, technique='protocolsio-url')
+ return dict(pdf_url=url, technique="protocolsio-url")
# degruyter.com
# https://www.degruyter.com/view/books/9783486594621/9783486594621-009/9783486594621-009.xml
if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"):
- url = html_url.replace('/view/', '/downloadpdf/').replace('.xml', '.pdf')
- return dict(pdf_url=url, technique='degruyter-url')
+ url = html_url.replace("/view/", "/downloadpdf/").replace(".xml", ".pdf")
+ return dict(pdf_url=url, technique="degruyter-url")
# journals.lww.com (Wolters Kluwer)
# https://journals.lww.com/spinejournal/Abstract/publishahead/Making_the_Most_of_Systematic_Reviews_and.94318.aspx
@@ -192,116 +217,141 @@ def extract_fulltext_url(html_url, html_body):
# we never get the content.
if "://journals.lww.com/" in html_url and False:
# data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
- for line in html_body.split(b'\n'):
+ for line in html_body.split(b"\n"):
if b"data-pdf-url=" in line:
- line = line.decode('utf-8')
- url = line.strip().replace('data-pdf-url=', '').replace('"', '')
- if url.startswith('http') and 'pdfs.journals.lww.com' in url:
- return dict(pdf_url=url, technique='journals.lww.com-jsvar')
+ line = line.decode("utf-8")
+ url = line.strip().replace("data-pdf-url=", "").replace('"', "")
+ if url.startswith("http") and "pdfs.journals.lww.com" in url:
+ return dict(pdf_url=url, technique="journals.lww.com-jsvar")
# www.ahajournals.org
# https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
- if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url:
+ if "://www.ahajournals.org/doi/" in html_url and "/doi/pdf/" not in html_url:
# <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
- if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/10.', '/doi/pdf/10.')
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/10.", "/doi/pdf/10.")
url = url + "?download=true"
- return dict(pdf_url=url, technique='ahajournals-url')
+ return dict(pdf_url=url, technique="ahajournals-url")
# ehp.niehs.nih.gov
# https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
# https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51
if "://ehp.niehs.nih.gov/doi/" in html_url:
# <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
- if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.')
- return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/full/10.", "/doi/pdf/10.").replace(
+ "/doi/10.", "/doi/pdf/10."
+ )
+ return dict(pdf_url=url, technique="ehp.niehs.nigh.gov-url")
# cogentoa.com
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
- if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
+ if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
# blech, it's a SPA! All JS
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
url = html_url + ".pdf"
- return dict(pdf_url=url, technique='cogentoa-url')
+ return dict(pdf_url=url, technique="cogentoa-url")
# chemrxiv.org (likely to be other figshare domains also)
# https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419
- if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url:
+ if "://chemrxiv.org/articles/" in html_url or ".figshare.org/articles/" in html_url:
# <script id="app-data" type="text/json"> [...] </script>
- json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"})
+ json_tag = soup.find("script", id="app-data", attrs={"type": "text/json"})
if json_tag and json_tag.string:
app_data = json.loads(json_tag.string)
# "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf"
- url = app_data.get('article', {}).get('exportPdfDownloadUrl')
- if url and url.startswith('http'):
- return dict(pdf_url=url, technique='figshare-json')
+ url = app_data.get("article", {}).get("exportPdfDownloadUrl")
+ if url and url.startswith("http"):
+ return dict(pdf_url=url, technique="figshare-json")
# CNKI COVID-19 landing pages
# http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ
- if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url:
+ if "://en.gzbd.cnki.net/KCMS/detail/detail.aspx" in html_url:
# <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&amp;tablename=GZBJLAST2020&amp;dflag=pdfdown&#xA; "><i></i>PDF Download</a>
- href = soup.find('a', attrs={"id":"pdfDown"})
+ href = soup.find("a", attrs={"id": "pdfDown"})
if href:
- url = href['href'].strip().replace('&#xA;', '')
- if not url.startswith('http'):
+ url = href["href"].strip().replace("&#xA;", "")
+ if not url.startswith("http"):
url = host_prefix + url
- return dict(pdf_url=url, technique='cnki-href')
+ return dict(pdf_url=url, technique="cnki-href")
# RWTH AACHEN repository
- if '://publications.rwth-aachen.de/record/' in html_url:
- record_id = html_url.split('/')[-1]
+ if "://publications.rwth-aachen.de/record/" in html_url:
+ record_id = html_url.split("/")[-1]
url = f"{html_url}/files/{record_id}.pdf"
- if record_id.isdigit() and url.encode('utf-8') in html_body:
- return dict(pdf_url=url, technique='rwth-aachen-url')
+ if record_id.isdigit() and url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="rwth-aachen-url")
# physchemaspects.ru
- if '://physchemaspects.ru/' in html_url and soup:
- for href in soup.find_all('a'):
+ if "://physchemaspects.ru/" in html_url and soup:
+ for href in soup.find_all("a"):
if href.text == "download PDF file":
- url = href['href']
- if url.startswith('/'):
+ url = href["href"]
+ if url.startswith("/"):
url = host_prefix + url
- return dict(pdf_url=url, technique='physchemaspects-href')
+ return dict(pdf_url=url, technique="physchemaspects-href")
# OJS 3 (some)
if meta_generator and meta_generator.startswith("Open Journal Systems"):
- href = soup.find('a', attrs={"class":"obj_galley_link file"})
+ href = soup.find("a", attrs={"class": "obj_galley_link file"})
if href and href.text and "pdf" in href.text.lower():
- url = href['href'].strip()
- if url.startswith('/'):
+ url = href["href"].strip()
+ if url.startswith("/"):
url = host_prefix + url
- return dict(pdf_url=url, technique='ojs-galley-href')
+ return dict(pdf_url=url, technique="ojs-galley-href")
# ETH zurich e-periodica
- if '://www.e-periodica.ch/digbib/view' in html_url:
- url = html_url.replace('digbib/view', 'cntmng').split('#')[0]
- if url.encode('utf-8') in html_body:
- return dict(pdf_url=url, technique='href-eperiodica')
+ if "://www.e-periodica.ch/digbib/view" in html_url:
+ url = html_url.replace("digbib/view", "cntmng").split("#")[0]
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="href-eperiodica")
# JMIR
# https://mhealth.jmir.org/2020/7/e17891/
- if '.jmir.org/' in html_url and not "/pdf" in html_url and html_url.endswith("/"):
+ if ".jmir.org/" in html_url and "/pdf" not in html_url and html_url.endswith("/"):
url = html_url + "pdf"
- return dict(pdf_url=url, technique='jmir-url')
+ return dict(pdf_url=url, technique="jmir-url")
+
+ # Google Drive
+ # this is assuming it is a PDF
+ if "drive.google.com/file/d/" in html_url and "/view" in html_url:
+ gdrive_id = html_url.split("/")[5]
+ if len(gdrive_id) > 10:
+ # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
+ return dict(
+ pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+ technique="google-drive",
+ )
+
+ # https://doi.org/10.24850/j-tyca-14-4-7
+ # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+ if "docs.google.com/viewer?url=" in html_url:
+ original_url = html_url.split("?url=")[1]
+ if original_url:
+ return dict(pdf_url=original_url, technique="docs.google.com viewer")
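(Worked examples of the two Google cases added above, using the URLs already given in the comments.)

# drive.google.com file page -> direct download URL
html_url = "https://drive.google.com/file/d/15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24/view"
gdrive_id = html_url.split("/")[5]  # "15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24"
pdf_url = f"https://drive.google.com/uc?export=download&id={gdrive_id}"

# docs.google.com viewer -> unwrap the original URL from the query string
viewer_url = "https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150"
original_url = viewer_url.split("?url=")[1]
# -> "http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150"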
### below here we are doing guesses
# generic guess: try current URL plus .pdf, if it exists in the HTML body
- if not '.pdf' in html_url:
+ if ".pdf" not in html_url:
url = html_url + ".pdf"
- if url.encode('utf-8') in html_body:
- return dict(pdf_url=url, technique='guess-url-plus-pdf')
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="guess-url-plus-pdf")
return dict()
-def test_regex():
+
+def test_regex() -> None:
lines = """
blah
var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
asdf"""
m = OVID_JOURNAL_URL_REGEX.search(lines)
- assert m.group(1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
+ assert m
+ assert (
+ m.group(1)
+ == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
+ )
lines = """
window.onload = function () {
@@ -311,4 +361,5 @@ def test_regex():
"""
url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
+ assert m
assert m.group(1) == url
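(A hypothetical usage sketch of extract_fulltext_url() after these changes; the arxiv branch only inspects the URL, so an empty body is enough, and the arxiv identifier below is made up.)

from sandcrawler.html import extract_fulltext_url

result = extract_fulltext_url("https://arxiv.org/abs/2007.12345", b"<html></html>")
# expected: {"pdf_url": "https://arxiv.org/pdf/2007.12345", "technique": "arxiv-url"}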
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index f9f48a6..1e2d197 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,15 @@
-
-import sys
import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+import braveblock
import dateparser
-from selectolax.parser import HTMLParser
import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
from sandcrawler.misc import url_fuzzy_equal
-
# this is a map of metadata keys to CSS selectors
# sources for this list include:
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
@@ -22,7 +20,7 @@ from sandcrawler.misc import url_fuzzy_equal
# order of these are mostly by preference/quality (best option first), though
# also/sometimes re-ordered for lookup efficiency (lookup stops after first
# match)
-HEAD_META_PATTERNS: Any = {
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
"title": [
"meta[name='citation_title']",
"meta[name='eprints.title']",
@@ -159,7 +157,7 @@ HEAD_META_PATTERNS: Any = {
],
}
-HEAD_META_LIST_PATTERNS: Any = {
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
"contrib_names": [
"meta[name='citation_author']",
"meta[name='bepress_citation_author']",
@@ -180,7 +178,7 @@ HEAD_META_LIST_PATTERNS: Any = {
],
}
-XML_FULLTEXT_PATTERNS: List[dict] = [
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "meta[name='citation_xml_url']",
"attr": "content",
@@ -209,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
"technique": "SciElo XML link",
},
{
- "in_doc_url": "/article/view/",
+ "in_doc_url": "/view/",
"in_fulltext_url": "viewXML",
"selector": "a[class='obj_galley_link']",
"attr": "href",
@@ -222,9 +220,17 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
"technique": "ARPHA XML link",
"example_page": "https://zookeys.pensoft.net/article/26391",
},
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "xml",
+ "selector": "a.download-files-nlm",
+ "attr": "href",
+ "technique": "XML (NLM) download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
]
-HTML_FULLTEXT_PATTERNS: List[dict] = [
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "meta[name='citation_fulltext_html_url']",
"attr": "content",
@@ -249,11 +255,36 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
"attr": "href",
"technique": "dovepress fulltext link",
},
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+]
+
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "in_doc_url": "pensoft.net/article/", # also /element/
+ "in_fulltext_url": "/download/fig/",
+ "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
+ "attr": "href",
+ "technique": "Active figure download link (zookeys)",
+ "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
+ },
+ {
+ "in_doc_url": "/file.xhtml?persistentId",
+ "in_fulltext_url": "/access/datafile/",
+ "selector": "div.form-group code",
+ "use_body": "true",
+ "technique": "Dataverse 'download URL'",
+ "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
+ },
]
# This is a database of matching patterns. Most of these were discovered by hand,
# looking at OA journal content that failed to crawl/ingest.
-PDF_FULLTEXT_PATTERNS: List[dict] = [
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "head meta[name='citation_pdf_url']",
"attr": "content",
@@ -272,7 +303,7 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx",
},
{
- "selector": "head meta[propery='citation_pdf_url']",
+ "selector": "head meta[property='citation_pdf_url']",
"attr": "content",
"technique": "citation_pdf_url",
# eg, researchgate
@@ -300,10 +331,10 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
},
{
- "in_doc_url": "/article/view/",
+ "in_doc_url": "/view/",
"selector": "a#pdfDownloadLink",
"attr": "href",
- "technique": "pdfDownloadLink link",
+ "technique": "OJS pdfDownloadLink link",
"example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
},
{
@@ -375,16 +406,371 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"technique": "PDF URL link",
"example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
},
+ {
+ "in_doc_url": "degruyter.com/document/",
+ "in_fulltext_url": "/pdf",
+ "selector": "a.downloadPdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+ },
+ {
+ "in_doc_url": "repositorio.unicamp.br/handle/",
+ "in_fulltext_url": "/bitstream/",
+ "selector": "table.panel-body a[target='_blank']",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+ },
+ {
+ "in_doc_url": "dlc.library.columbia.edu/durst/",
+ "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+ "attr": "href",
+ "technique": "Access URL link",
+ "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+ },
+ {
+ "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+ "in_fulltext_url": "pdf",
+ "selector": "p a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+ },
+ {
+ "in_doc_url": "preprints.jmir.org/preprint/",
+ "selector": "a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://preprints.jmir.org/preprint/22556",
+ },
+ {
+ "in_doc_url": "bloomsburycollections.com/",
+ "in_fulltext_url": "pdf",
+ "selector": "li.download-item a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+ },
+ {
+ "in_doc_url": "emerald.com/insight/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.intent_pdf_link",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+ },
+ {
+ "in_doc_url": "ingentaconnect.com/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[data-popup]",
+ "attr": "data-popup",
+ "technique": "PDF URL link",
+ "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+ },
+ {
+ "in_doc_url": "library.wur.nl/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.wl_full_text_restricted",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922",
+ },
+ {
+ "in_doc_url": "/dlibra/",
+ "in_fulltext_url": "pdf",
+ "selector": "iframe#js-main-frame",
+ "attr": "src",
+ "technique": "PDF iframe (dlibra)",
+ "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031",
+ },
+ {
+ "in_doc_url": "/handle/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.misc table.inner tr.b a",
+ "attr": "href",
+ "technique": "PDF URL link (DSpace, first file)",
+ "example_page": "https://orbi.uliege.be/handle/2268/174200",
+ },
+ {
+ "in_doc_url": "/publications/",
+ "in_fulltext_url": "pdf",
+ "selector": ".publication-sidebar li.open-access a.document-link",
+ "attr": "href",
+ "technique": "PDF URL link (Pure repo, OA link)",
+ "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance",
+ },
+ {
+ "in_doc_url": "//hal",
+ "selector": ".widget-openaccess .widget-content a",
+ "attr": "href",
+ "technique": "Fulltext OA URL (HAL)",
+ "example_page": "https://hal.archives-ouvertes.fr/hal-00744951",
+ },
+ {
+ "in_doc_url": "/record/",
+ "in_fulltext_url": "pdf",
+ "selector": "#detailedrecordminipanelfile a",
+ "attr": "href",
+ "technique": "PDF URL link (Invenio)",
+ "example_page": "https://bib-pubdb1.desy.de/record/416556",
+ },
+ {
+ "in_doc_url": "/available/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.file-table a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/",
+ },
+ {
+ "in_doc_url": "/islandora/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.islandora-pdf-link",
+ "attr": "href",
+ "technique": "PDF URL link (Islandora)",
+ "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804",
+ },
+ {
+ "in_doc_url": "/receive/",
+ "in_fulltext_url": "pdf",
+ "selector": ".mir-preview noscript a",
+ "attr": "href",
+ "technique": "PDF iframe via noscript (MyCoRe)",
+ "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191",
+ },
+ {
+ "in_doc_url": "/registro.do",
+ "in_fulltext_url": "imagenes",
+ "selector": ".resumen_bib a[data-analytics=media]",
+ "attr": "href",
+ "technique": "Media link (DIGIBIS)",
+ "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740",
+ },
+ {
+ "in_doc_url": "/view",
+ "in_fulltext_url": "/at_download/",
+ "selector": ".documentContent #content a",
+ "attr": "href",
+ "technique": "Media link (Plone)",
+ "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view",
+ },
+ {
+ "in_doc_url": "isca-speech.org/",
+ "in_fulltext_url": "pdf",
+ "selector": ".w3-container a",
+ "attr": "href",
+ "technique": "PDF URL link (isca-speech.org)",
+ "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html",
+ },
+ {
+ "in_doc_url": "://repository.dri.ie/",
+ "in_fulltext_url": "/download",
+ "selector": "#dri_download_assets > div > a",
+ "attr": "href",
+ "technique": "Download link (repository.dri.ie)",
+ "example_page": "https://repository.dri.ie/catalog/qf8621102",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.download-files-pdf",
+ "attr": "href",
+ "technique": "PDF Download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+ {
+ "in_doc_url": "cureus.com/",
+ "in_fulltext_url": "pdf",
+ "selector": ".small-medium-pdf a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF Download link (cureus.com)",
+ "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks",
+ },
+ {
+ "in_doc_url": "e-manuscripta.ch/",
+ "in_fulltext_url": "pdf",
+ "selector": "#titleinfoPdfDownload a.resourceLink",
+ "attr": "href",
+ "technique": "PDF Download link (e-manuscripta.ch)",
+ "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
+ },
+ {
+ "in_doc_url": "journals.uchicago.edu",
+ "in_fulltext_url": "pdf",
+ "selector": "nav.article__navbar a.ctrl--pdf",
+ "attr": "href",
+ "technique": "PDF Download link (journals.uchicago.edu)",
+ "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+ },
+ {
+ "in_doc_url": "integrityresjournals.org",
+ "in_fulltext_url": "/article-full-text-pdf/",
+ "selector": "a[target='_blank'].btn-danger",
+ "attr": "href",
+ "technique": "PDF Download link (integrityresjournals.org)",
+ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body.pkp_page_article a.download",
+ "attr": "href",
+ "technique": "OJS PDF Embed",
+ "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "/article/",
+ "selector": "a.pdf",
+ "attr": "href",
+ "technique": "OJS PDF link",
+ },
+ {
+ "in_doc_url": "scitemed.com/article/",
+ "in_fulltext_url": ".pdf",
+ "selector": "li.tab_pdf_btn a",
+ "attr": "href",
+ "technique": "PDF link (scitemed.com)",
+ },
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+ {
+ "in_doc_url": "/jvi.aspx",
+ "in_fulltext_url": "download_fulltext",
+ "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+ "attr": "href",
+ "technique": "erciyesmedj.com publication system PDF download link",
+ },
+ {
+ "selector": "body embed[alt='pdf']",
+ "attr": "src",
+ "technique": "embed PDF",
+ "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+ },
+ {
+ "in_fulltext_url": "viewPDFInterstitial",
+ "in_doc_url": "/view/",
+ "selector": "frameset frame",
+ "attr": "src",
+ "technique": "PDF iframe (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ # note this one has a special handler
+ "in_doc_url": "viewPDFInterstitial",
+ "in_fulltext_url": "://",
+ "selector": "head meta[http-equiv='refresh']",
+ "attr": "content",
+ "technique": "HTML meta refresh (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ "in_doc_url": "dlib.si/details/",
+ "in_fulltext_url": "PDF",
+ "selector": "body #FilesBox a",
+ "attr": "href",
+ "technique": "dlib.si download links",
+ "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+ },
+ {
+ "in_doc_url": "filclass.ru",
+ "in_fulltext_url": "pdf",
+ "selector": "main .pdf-article a.pdficon",
+ "attr": "href",
+ "technique": "filclass.ru PDF link",
+ "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+ },
+ {
+ "in_doc_url": "cdnsciencepub.com",
+ "in_fulltext_url": "pdf",
+ "selector": "article .info-panel a.btn--pdf",
+ "attr": "href",
+ "technique": "cdnsciencepub.com PDF link",
+ "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+ },
+ {
+ "in_doc_url": "grrjournal.com",
+ "in_fulltext_url": "pdf",
+ "selector": ".ereaders-main-section a[download]",
+ "attr": "href",
+ "technique": "grrjournal.com PDF link",
+ "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "pdf",
+ "selector": "#articleFullText a.remote_pdf",
+ "attr": "href",
+ "technique": "OJS remote_pdf link",
+ "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/abs/",
+ "in_fulltext_url": "/reader/",
+ "selector": "article.container .single__download a",
+ "attr": "href",
+ "technique": "worldscientific landing pages",
+ "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/",
+ "in_fulltext_url": "/pdf/",
+ "selector": "noscript a[target='_blank']",
+ "attr": "href",
+ "technique": "worldscientific reader",
+ "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": ".container .view-content .download-article a",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": "body a.download-pdf",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/view/",
+ "selector": "body .entry_details a.pdf",
+ "attr": "href",
+ "technique": "generic OJS/preprints",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body header a.download",
+ "attr": "href",
+ "technique": "generic OJS/preprints PDF Embed",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327",
+ },
]
-FULLTEXT_URL_PATTERNS_SKIP = [
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
# wiley has a weird almost-blank page we don't want to loop on
- "://onlinelibrary.wiley.com/doi/pdf/"
- "://doi.org/"
- "://dx.doi.org/"
+ "://onlinelibrary.wiley.com/doi/pdf/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "{'embed': '",
]
-RELEASE_TYPE_MAP = {
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+ "javascript:",
+ "about:",
+]
+
+RELEASE_TYPE_MAP: Dict[str, str] = {
"research article": "article-journal",
"text.serial.journal": "article-journal",
}
@@ -426,14 +812,15 @@ class BiblioMetadata(pydantic.BaseModel):
pdf_fulltext_url: Optional[str]
html_fulltext_url: Optional[str]
xml_fulltext_url: Optional[str]
+ component_url: Optional[str]
class Config:
- json_encoders = {
- datetime.date: lambda dt: dt.isoformat()
- }
+ json_encoders = {datetime.date: lambda dt: dt.isoformat()}
-def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
+def html_extract_fulltext_url(
+ doc_url: str, doc: HTMLParser, patterns: List[dict]
+) -> Optional[Tuple[str, str]]:
"""
Tries to quickly extract fulltext URLs using a set of patterns. This
    function is intended to be generic across various extraction techniques.
@@ -442,49 +829,74 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
"""
self_doc_url: Optional[Tuple[str, str]] = None
for pattern in patterns:
- if not 'selector' in pattern:
+ if "selector" not in pattern:
continue
- if 'in_doc_url' in pattern:
- if not pattern['in_doc_url'] in doc_url:
+ if "in_doc_url" in pattern:
+ if pattern["in_doc_url"] not in doc_url:
continue
- elem = doc.css_first(pattern['selector'])
+ elem = doc.css_first(pattern["selector"])
if not elem:
continue
- if 'attr' in pattern:
- val = elem.attrs.get(pattern['attr'])
- if not val:
+ val = None
+ if "attr" in pattern:
+ val = elem.attrs.get(pattern["attr"])
+ # handle HTML redirect
+ if val and pattern["attr"] == "content" and "URL=" in val:
+ val = val.split("URL=")[1]
+ elif pattern.get("use_body"):
+ val = elem.text()
+ if "://" not in val:
continue
- val = urllib.parse.urljoin(doc_url, val)
- assert val
- if 'in_fulltext_url' in pattern:
- if not pattern['in_fulltext_url'] in val:
- continue
- for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
- if skip_pattern in val.lower():
- continue
- if url_fuzzy_equal(doc_url, val):
- # don't link to self, unless no other options
- self_doc_url = (val, pattern.get('technique', 'unknown'))
+ if not val:
+ continue
+ val = urllib.parse.urljoin(doc_url, val)
+ assert val
+ if "in_fulltext_url" in pattern:
+ if pattern["in_fulltext_url"] not in val:
continue
- return (val, pattern.get('technique', 'unknown'))
+ skip_matched = False
+ for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
+ if skip_pattern in val.lower():
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+ if val.lower().startswith(skip_pattern):
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ if url_fuzzy_equal(doc_url, val):
+ # don't link to self, unless no other options
+ self_doc_url = (val, pattern.get("technique", "unknown"))
+ continue
+
+ # quirks modes / hacks
+ if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+ val = val[:-1]
+
+ return (val, pattern.get("technique", "unknown"))
if self_doc_url:
- print(f" WARN: returning fulltext URL pointing to self", file=sys.stderr)
+ print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
return self_doc_url
return None
+
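(A minimal sketch of how a single pattern entry is applied by html_extract_fulltext_url(), using the same selectolax calls the function relies on; the document and URL below are hypothetical.)

import urllib.parse

from selectolax.parser import HTMLParser

doc_url = "https://example.com/index.php/journal/article/view/100"
doc = HTMLParser('<a id="pdfDownloadLink" href="/index.php/journal/article/download/100/55">PDF</a>')
pattern = {"in_doc_url": "/view/", "selector": "a#pdfDownloadLink", "attr": "href"}

val = None
if pattern["in_doc_url"] in doc_url:
    elem = doc.css_first(pattern["selector"])
    if elem:
        val = elem.attrs.get(pattern["attr"])
if val:
    val = urllib.parse.urljoin(doc_url, val)
# val -> "https://example.com/index.php/journal/article/download/100/55"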
def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
meta: Any = dict()
head = doc.css_first("head")
if not head:
+ print(f"WARN: empty <head>? {doc_url}", file=sys.stderr)
return None
for field, patterns in HEAD_META_PATTERNS.items():
for pattern in patterns:
val = head.css_first(pattern)
- #print((field, pattern, val))
- if val and 'content' in val.attrs and val.attrs['content']:
- meta[field] = val.attrs['content']
+ # print((field, pattern, val))
+ if val and "content" in val.attrs and val.attrs["content"]:
+ meta[field] = val.attrs["content"]
break
for field, patterns in HEAD_META_LIST_PATTERNS.items():
@@ -492,53 +904,57 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
val_list = head.css(pattern)
if val_list:
for val in val_list:
- if 'content' in val.attrs and val.attrs['content']:
- if not field in meta:
+ if "content" in val.attrs and val.attrs["content"]:
+ if field not in meta:
meta[field] = []
- meta[field].append(val.attrs['content'])
+ meta[field].append(val.attrs["content"])
break
# (some) fulltext extractions
pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS)
if pdf_fulltext_url:
- meta['pdf_fulltext_url'] = pdf_fulltext_url[0]
+ meta["pdf_fulltext_url"] = pdf_fulltext_url[0]
xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
if xml_fulltext_url:
- meta['xml_fulltext_url'] = xml_fulltext_url[0]
+ meta["xml_fulltext_url"] = xml_fulltext_url[0]
html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
if html_fulltext_url:
- meta['html_fulltext_url'] = html_fulltext_url[0]
+ meta["html_fulltext_url"] = html_fulltext_url[0]
+ component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS)
+ if component_url:
+ meta["component_url"] = component_url[0]
# TODO: replace with clean_doi() et al
- if meta.get('doi') and meta.get('doi').startswith('doi:'):
- meta['doi'] = meta['doi'][4:]
+ if meta.get("doi") and meta.get("doi").startswith("doi:"):
+ meta["doi"] = meta["doi"][4:]
- raw_identifiers = meta.pop('raw_identifiers', [])
+ raw_identifiers = meta.pop("raw_identifiers", [])
for ident in raw_identifiers:
- if ident.startswith('doi:10.'):
- if not 'doi' in meta:
- meta['doi'] = ident.replace('doi:', '')
- elif ident.startswith('10.') and '/' in ident:
- if not 'doi' in meta:
- meta['doi'] = ident
- elif ident.startswith('isbn:'):
- if not 'isbn' in meta:
- meta['isbn'] = ident.replace('isbn:', '')
-
- raw_date = meta.pop('raw_date', None)
+ if ident.startswith("doi:10."):
+ if "doi" not in meta:
+ meta["doi"] = ident.replace("doi:", "")
+ elif ident.startswith("10.") and "/" in ident:
+ if "doi" not in meta:
+ meta["doi"] = ident
+ elif ident.startswith("isbn:"):
+ if "isbn" not in meta:
+ meta["isbn"] = ident.replace("isbn:", "")
+
+ raw_date = meta.pop("raw_date", None)
if raw_date:
parsed = dateparser.parse(raw_date)
if parsed:
- meta['release_date'] = parsed.date()
+ meta["release_date"] = parsed.date()
- raw_release_type = meta.pop('raw_release_type', None)
+ raw_release_type = meta.pop("raw_release_type", None)
if raw_release_type:
release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
if release_type:
- meta['release_type'] = release_type
+ meta["release_type"] = release_type
return BiblioMetadata(**meta)
+
def load_adblock_rules() -> braveblock.Adblocker:
"""
TODO: consider blocking very generic assets:
@@ -561,46 +977,67 @@ def load_adblock_rules() -> braveblock.Adblocker:
"||pbs.twimg.com^",
"||badge.dimensions.ai^",
"||recaptcha.net^",
-
+ "||tag.imagino.com^",
+ "||consent.cookiebot.com^",
+ "||recaptcha.net^",
# not sure about these CC badges (usually via a redirect)
- #"||licensebuttons.net^",
- #"||i.creativecommons.org^",
-
+ # "||licensebuttons.net^",
+ # "||i.creativecommons.org^",
# Should we skip jquery, or other generic javascript CDNs?
- #"||code.jquery.com^",
- #"||ajax.googleapis.com^",
- #"||cdnjs.cloudflare.com^",
-
+ # "||code.jquery.com^",
+ # "||ajax.googleapis.com^",
+ # "||cdnjs.cloudflare.com^",
# badges, "share" buttons, tracking, etc
"apis.google.com/js/plusone",
"www.google.com/recaptcha/",
"js/_getUACode.js"
-
# PLOS images
"/resource/img/icon.*.16.png^",
+ # CAIRN broken tracking tag
+ "cairn-int.info//about.php?cairn_guest=",
],
)
-def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+def _extract_generic(
+ doc: HTMLParser, selector: str, attrs: List[str], type_name: str
+) -> List[Dict[str, str]]:
resources = []
for node in doc.css(selector):
for attr in attrs:
- if not attr in node.attrs:
+ if attr not in node.attrs:
continue
url = node.attrs.get(attr)
+ # special-case a couple meta URI prefixes which don't match with adblock rules
+ skip = False
+ for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]:
+ if url and url.startswith(prefix):
+ skip = True
+ break
+ if url and "/" not in url and "." not in url and " " in url:
+ # eg: "Ce fichier n'existe pas"
+ skip = True
+ if skip:
+ continue
+ if url and url.startswith("https://https://"):
+ url = url[8:]
+ elif url and url.startswith("http://http://"):
+ url = url[7:]
if url:
- resources.append(dict(url=url, type=type_name))
+ # print(url, file=sys.stderr)
+ resources.append(dict(url=url.strip(), type=type_name))
return resources
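(Worked examples of the new skip and cleanup rules above; all values are made up.)

url = "https://https://example.com/figure1.png"
if url.startswith("https://https://"):
    url = url[8:]
# -> "https://example.com/figure1.png"

# prefix skip: "javascript:void(0)" or "mailto:editor@example.com" would be dropped
# junk-text skip: "Ce fichier n'existe pas" has spaces but no "/" or ".", so it is dropped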
-def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list:
+def html_extract_resources(
+ doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker
+) -> List[Dict[str, str]]:
"""
This function tries to find all the important resources in a page. The
presumption is that the HTML document is article fulltext, and we want the
- list of all resoures (by URL) necessary to replay the page.
+ list of all resources (by URL) necessary to replay the page.
The returned resource URLs each have a type (script, img, css, etc), and
should be fully-qualified URLs (not relative).
@@ -624,13 +1061,17 @@ def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Ad
# ensure URLs are absolute
for r in resources:
- r['url'] = urllib.parse.urljoin(doc_url, r['url'])
+ r["url"] = urllib.parse.urljoin(doc_url, r["url"])
# filter using adblocker
- resources = [r for r in resources if adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']) == False]
+ resources = [
+ r
+ for r in resources
+ if adblock.check_network_urls(r["url"], source_url=doc_url, request_type=r["type"])
+ is False
+ ]
# remove duplicates
resources = [dict(t) for t in {tuple(d.items()) for d in resources}]
return resources
-
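(For reference, a worked example of the de-duplication idiom kept at the end of html_extract_resources(); the resource dicts are made up.)

resources = [
    {"url": "https://example.com/app.js", "type": "script"},
    {"url": "https://example.com/app.js", "type": "script"},
    {"url": "https://example.com/logo.png", "type": "img"},
]
deduped = [dict(t) for t in {tuple(d.items()) for d in resources}]
# two unique resources remain; ordering is arbitrary because a set is used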
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 806f1e7..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1,31 +1,31 @@
-
# XXX: some broken MRO thing going on in here due to python3 object wrangling
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import datetime
+import gzip
+import http.client
+import json
import os
import sys
import time
-import gzip
-import json
-import requests
-import datetime
import urllib.parse
-import urllib3.exceptions
-from typing import Tuple
from collections import namedtuple
+from http.client import IncompleteRead
+from typing import Any, Dict, List, Optional, Tuple, Union
-import http.client
+import requests
+import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory3
+from wayback.resourcestore import ResourceStore
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
class SandcrawlerBackoffError(Exception):
"""
@@ -34,62 +34,78 @@ class SandcrawlerBackoffError(Exception):
be passed up through any timeout/retry code and become an actual long pause
or crash.
"""
+
pass
-ResourceResult = namedtuple("ResourceResult", [
- "start_url",
- "hit",
- "status",
- "terminal_url",
- "terminal_dt",
- "terminal_status_code",
- "body",
- "cdx",
- "revisit_cdx",
-])
-
-WarcResource = namedtuple("WarcResource", [
- "status_code",
- "location",
- "body",
- "revisit_cdx",
-])
-
-CdxRow = namedtuple('CdxRow', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
- 'warc_csize',
- 'warc_offset',
- 'warc_path',
-])
-
-CdxPartial = namedtuple('CdxPartial', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
-])
-
-def cdx_partial_from_row(full):
+
+ResourceResult = namedtuple(
+ "ResourceResult",
+ [
+ "start_url",
+ "hit",
+ "status",
+ "terminal_url",
+ "terminal_dt",
+ "terminal_status_code",
+ "body",
+ "cdx",
+ "revisit_cdx",
+ ],
+)
+
+WarcResource = namedtuple(
+ "WarcResource",
+ [
+ "status_code",
+ "location",
+ "body",
+ "revisit_cdx",
+ ],
+)
+
+CdxRow = namedtuple(
+ "CdxRow",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ "warc_csize",
+ "warc_offset",
+ "warc_path",
+ ],
+)
+
+CdxPartial = namedtuple(
+ "CdxPartial",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ ],
+)
+
+
+def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial:
return CdxPartial(
- surt=full.surt,
- datetime=full.datetime,
- url=full.url,
- mimetype=full.mimetype,
- status_code=full.status_code,
- sha1b32=full.sha1b32,
- sha1hex=full.sha1hex,
+ surt=row.surt,
+ datetime=row.datetime,
+ url=row.url,
+ mimetype=row.mimetype,
+ status_code=row.status_code,
+ sha1b32=row.sha1b32,
+ sha1hex=row.sha1hex,
)
-def cdx_to_dict(cdx):
+
+def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]:
d = {
"surt": cdx.surt,
"datetime": cdx.datetime,
@@ -99,67 +115,82 @@ def cdx_to_dict(cdx):
"sha1b32": cdx.sha1b32,
"sha1hex": cdx.sha1hex,
}
- if type(cdx) == CdxRow and '/' in cdx.warc_path:
- d['warc_csize'] = cdx.warc_csize
- d['warc_offset'] = cdx.warc_offset
- d['warc_path'] = cdx.warc_path
+ if type(cdx) == CdxRow and "/" in cdx.warc_path:
+ d["warc_csize"] = cdx.warc_csize
+ d["warc_offset"] = cdx.warc_offset
+ d["warc_path"] = cdx.warc_path
return d
-def fuzzy_match_url(left, right):
+
+def fuzzy_match_url(left: str, right: str) -> bool:
"""
Matches URLs agnostic of http/https (and maybe other normalizations in the
future)
"""
if left == right:
return True
- if '://' in left and '://' in right:
- left = '://'.join(left.split('://')[1:])
- right = '://'.join(right.split('://')[1:])
+ if "://" in left and "://" in right:
+ left = "://".join(left.split("://")[1:])
+ right = "://".join(right.split("://")[1:])
if left == right:
return True
if left == right + "/" or right == left + "/":
return True
+ if left.replace("//", "/") == right.replace("//", "/"):
+ return True
return False
-def test_fuzzy_match_url():
- assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
+
+def test_fuzzy_match_url() -> None:
+ assert fuzzy_match_url("http://thing.com", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "https://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+ assert (
+ fuzzy_match_url(
+ "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+ "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+ )
+ is True
+ )
# should probably handle these?
- assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False
+ assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") is False
+
class CdxApiError(Exception):
pass
-class CdxApiClient:
- def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
+class CdxApiClient:
+ def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- cdx_auth_token = kwargs.get('cdx_auth_token',
- os.environ.get('CDX_AUTH_TOKEN'))
+ cdx_auth_token = kwargs.get("cdx_auth_token", os.environ.get("CDX_AUTH_TOKEN"))
if not cdx_auth_token:
- raise Exception("CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)")
- self.http_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
- 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token),
- })
+ raise Exception(
+ "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)"
+ )
+ self.http_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.CdxApiClient",
+ "Cookie": "cdx_auth_token={}".format(cdx_auth_token),
+ }
+ )
- def _query_api(self, params):
+ def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]:
"""
Hits CDX API with a query, parses result into a list of CdxRow
"""
resp = self.http_session.get(self.host_url, params=params)
if resp.status_code != 200:
raise CdxApiError(resp.text)
- #print(resp.url, file=sys.stderr)
+ # print(resp.url, file=sys.stderr)
if not resp.text:
return None
rj = resp.json()
@@ -180,8 +211,17 @@ class CdxApiClient:
else:
status_code = int(raw[4])
- # CDX rows with no WARC records?
- if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+ # remove CDX rows with no WARC records (?)
+ if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
+ continue
+
+ # remove CDX rows with SHA256 (not SHA1) digests
+ if raw[5].startswith("sha-256"):
+ continue
+
+ # remove CDX rows with 'error' digests
+ # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+ if raw[5].lower() == "error":
continue
row = CdxRow(
@@ -200,23 +240,31 @@ class CdxApiClient:
rows.append(row)
return rows
- def fetch(self, url, datetime, filter_status_code=None, retry_sleep=None):
+ def fetch(
+ self,
+ url: str,
+ datetime: str,
+ filter_status_code: Optional[int] = None,
+ retry_sleep: Optional[int] = None,
+ ) -> CdxRow:
"""
Fetches a single CDX row by url/datetime. Raises a KeyError if not
found, because we expect to be looking up a specific full record.
"""
if len(datetime) != 14:
- raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
- params = {
- 'url': url,
- 'from': datetime,
- 'to': datetime,
- 'matchType': 'exact',
- 'limit': 1,
- 'output': 'json',
+ raise ValueError(
+ "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)
+ )
+ params: Dict[str, str] = {
+ "url": url,
+ "from": datetime,
+ "to": datetime,
+ "matchType": "exact",
+ "limit": "1",
+ "output": "json",
}
if filter_status_code:
- params['filter'] = "statuscode:{}".format(filter_status_code)
+ params["filter"] = "statuscode:{}".format(filter_status_code)
resp = self._query_api(params)
if not resp:
if retry_sleep and retry_sleep > 0:
@@ -224,23 +272,43 @@ class CdxApiClient:
if retry_sleep > 3:
next_sleep = retry_sleep - 3
retry_sleep = 3
- print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep)
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep
+ )
raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
row = resp[0]
# allow fuzzy http/https match
if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
if retry_sleep and retry_sleep > 0:
- print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
- raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=None
+ )
+ raise KeyError(
+ "Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(
+ url, datetime, row
+ )
+ )
if filter_status_code:
assert row.status_code == filter_status_code
return row
- def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None):
+ def lookup_best(
+ self,
+ url: str,
+ max_age_days: Optional[int] = None,
+ best_mimetype: Optional[str] = None,
+ closest: Union[datetime.datetime, str, None] = None,
+ ) -> Optional[CdxRow]:
"""
Fetches multiple CDX rows for the given URL, tries to find the most recent.
@@ -263,42 +331,50 @@ class CdxApiClient:
most-recent
"""
- params = {
- 'url': url,
- 'matchType': 'exact',
- 'limit': -25,
- 'output': 'json',
+ params: Dict[str, str] = {
+ "url": url,
+ "matchType": "exact",
+ "limit": "-40",
+ "output": "json",
# Collapsing seems efficient, but is complex; would need to include
# other filters and status code in filter
#'collapse': 'timestamp:6',
-
# Revisits now allowed and resolved!
#'filter': '!mimetype:warc/revisit',
}
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
- params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day),
+ params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+ closest_dt = "00000000"
if closest:
- params['closest'] = closest
- params['sort'] = "closest"
- #print(params, file=sys.stderr)
+ if isinstance(closest, datetime.datetime):
+ closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ params["closest"] = closest_dt
+ else:
+ closest_dt = closest
+ params["closest"] = closest_dt
+ params["sort"] = "closest"
+ # print(params, file=sys.stderr)
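+ # Editorial example (hypothetical URL): lookup_best("http://example.org/x.pdf",
+ # closest=datetime.datetime(2020, 1, 1)) would query with roughly:
+ #   {"url": "http://example.org/x.pdf", "matchType": "exact", "limit": "-40",
+ #    "output": "json", "closest": "20200101", "sort": "closest"}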
rows = self._query_api(params)
if not rows:
return None
- def _cdx_sort_key(r):
+ def _cdx_sort_key(r: CdxRow) -> tuple:
"""
This is a function, not a lambda, because it captures
best_mimetype. Will create a tuple that can be used to sort in
*reverse* order.
"""
return (
+ int(r.url == url),
int(r.status_code in (200, 226)),
int(0 - (r.status_code or 999)),
int(r.mimetype == best_mimetype),
int(r.mimetype != "warc/revisit"),
- int('/' in r.warc_path),
+ r.datetime[:4] == closest_dt[:4],
int(r.datetime),
+ # NOTE: previously this warc_path check came before the datetime field, demoting SPN (liveweb) records
+ int("/" in r.warc_path),
)
rows = sorted(rows, key=_cdx_sort_key)
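+ # Editorial note: with a plain ascending sort, the tuple above places the most
+ # desirable capture (exact URL, 200/226 status, preferred mimetype, closest
+ # year, newest datetime) at the *end* of the list; the selection logic that
+ # follows is outside this hunk.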
@@ -308,39 +384,48 @@ class CdxApiClient:
class WaybackError(Exception):
pass
+
class WaybackContentError(Exception):
pass
+
class PetaboxError(Exception):
pass
+
class NoCaptureError(Exception):
pass
-class WaybackClient:
- def __init__(self, cdx_client=None, **kwargs):
+class WaybackClient:
+ def __init__(self, cdx_client: Optional[CdxApiClient] = None, **kwargs):
if cdx_client:
self.cdx_client = cdx_client
else:
self.cdx_client = CdxApiClient()
# /serve/ instead of /download/ doesn't record view count
# this *does* want to be http://, not https://
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
self.petabox_webdata_secret = kwargs.get(
- 'petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'),
+ "petabox_webdata_secret",
+ os.environ.get("PETABOX_WEBDATA_SECRET"),
)
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix", "https://archive.org/serve/")
self.rstore = None
self.max_redirects = 25
self.wayback_endpoint = "https://web.archive.org/web/"
self.replay_headers = {
- 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+ "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
}
+ self.http_session = requests_retry_session()
+ self.record_http_session = requests_retry_session(
+ status_forcelist=[],
+ )
- def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
+ def fetch_petabox(
+ self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
+ ) -> WarcResource:
"""
Fetches wayback resource directly from petabox using WARC path/offset/csize.
@@ -363,33 +448,56 @@ class WaybackClient:
"""
if not self.petabox_webdata_secret:
raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
- if not "/" in warc_path:
- raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
+ if "/" not in warc_path:
+ raise ValueError(
+ "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)
+ )
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory3(
- webdata_secret=self.petabox_webdata_secret,
- ))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory3(
+ webdata_secret=self.petabox_webdata_secret,
+ )
+ )
+ assert self.rstore
try:
- #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
+ # print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
except wayback.exception.ResourceUnavailable:
print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
- raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except wayback.exception.InvalidResource:
print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
- raise WaybackContentError("failed to load file contents from wayback/petabox (InvalidResource)")
+ raise WaybackContentError(
+ "failed to load file contents from wayback/petabox (InvalidResource)"
+ )
except urllib3.exceptions.ReadTimeoutError as rte:
- raise PetaboxError("failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(rte))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(
+ rte
+ )
+ )
except ValueError as ve:
- raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)
+ )
except EOFError as eofe:
- raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)
+ )
except TypeError as te:
- raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ )
+ )
except Exception as e:
if "while decompressing data: invalid block type" in str(e):
- raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+ raise PetaboxError(
+ "decompression error fetching WARC record; usually due to bad alexa ARC files"
+ )
else:
raise e
# Note: could consider a generic "except Exception" here, as we get so
@@ -402,7 +510,11 @@ class WaybackClient:
raise WaybackContentError("too many HTTP headers (in wayback fetch)")
location = gwb_record.get_location() or None
- if status_code is None and gwb_record.target_uri.startswith(b"ftp://") and not gwb_record.is_revisit():
+ if (
+ status_code is None
+ and gwb_record.target_uri.startswith(b"ftp://")
+ and not gwb_record.is_revisit()
+ ):
# TODO: some additional verification here?
status_code = 226
@@ -413,16 +525,21 @@ class WaybackClient:
raise WaybackContentError("found revisit record, but won't resolve (loop?)")
revisit_uri, revisit_dt = gwb_record.refers_to
if not (revisit_uri and revisit_dt):
- raise WaybackContentError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
- warc_path, offset))
+ raise WaybackContentError(
+ "revisit record missing URI and/or DT: warc:{} offset:{}".format(
+ warc_path, offset
+ )
+ )
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
assert len(revisit_dt) in (19, 20)
if type(revisit_uri) is bytes:
- revisit_uri = revisit_uri.decode('utf-8')
+ revisit_uri = revisit_uri.decode("utf-8")
if type(revisit_dt) is bytes:
- revisit_dt = revisit_dt.decode('utf-8')
- revisit_dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+ revisit_dt = revisit_dt.decode("utf-8")
+ revisit_dt = (
+ revisit_dt.replace("-", "").replace(":", "").replace("T", "").replace("Z", "")
+ )
assert len(revisit_dt) == 14
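+ # e.g. (editorial worked example) "2018-07-24T11:56:49Z" -> "20180724115649"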
try:
revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
@@ -440,10 +557,12 @@ class WaybackClient:
body = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
raise WaybackError(
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ )
+ )
elif status_code is None:
- raise WaybackContentError(
- "got a None status_code in (W)ARC record")
+ raise WaybackContentError("got a None status_code in (W)ARC record")
return WarcResource(
status_code=status_code,
location=location,
@@ -451,7 +570,14 @@ class WaybackClient:
revisit_cdx=revisit_cdx,
)
- def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None):
+ def fetch_petabox_body(
+ self,
+ csize: int,
+ offset: int,
+ warc_path: str,
+ resolve_revisit: bool = True,
+ expected_status_code: Optional[int] = None,
+ ) -> bytes:
"""
Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
@@ -468,19 +594,22 @@ class WaybackClient:
if expected_status_code:
if expected_status_code != resource.status_code:
- raise KeyError("archived HTTP response (WARC) was not {}: {}".format(
- expected_status_code,
- resource.status_code,
+ raise KeyError(
+ "archived HTTP response (WARC) was not {}: {}".format(
+ expected_status_code,
+ resource.status_code,
)
)
elif resource.status_code not in (200, 226):
- raise KeyError("archived HTTP response (WARC) was not 200: {}".format(
- resource.status_code)
+ raise KeyError(
+ "archived HTTP response (WARC) was not 200: {}".format(resource.status_code)
)
return resource.body
- def fetch_replay_body(self, url, datetime, cdx_sha1hex=None):
+ def fetch_replay_body(
+ self, url: str, datetime: str, cdx_sha1hex: Optional[str] = None
+ ) -> bytes:
"""
Fetches an HTTP 200 record from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -501,46 +630,59 @@ class WaybackClient:
assert datetime.isdigit()
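+ # Editorial note: the request below targets a URL of the form
+ #   https://web.archive.org/web/<14-digit-dt>id_/<original-url>
+ # e.g. https://web.archive.org/web/20200101120000id_/http://example.org/paper.pdf
+ # (the "id_" suffix asks wayback replay for the original, un-rewritten record body)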
try:
- resp = requests.get(
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except requests.exceptions.ConnectionError:
+ raise WaybackContentError("ConnectionError (wayback replay fetch)")
except requests.exceptions.ChunkedEncodingError:
raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
-
- try:
- resp.raise_for_status()
- except Exception as e:
- raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
# defensively check that this is actually correct replay based on headers
- if not "X-Archive-Src" in resp.headers:
+ if "X-Archive-Src" not in resp.headers:
+ # check if this was an error first
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ raise WaybackError(str(e))
+ # otherwise, a weird case (200/redirect but no X-Archive-Src header)
raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
if cdx_sha1hex:
# verify that body matches CDX hash
# TODO: don't need *all* these hashes, just sha1
file_meta = gen_file_metadata(resp.content)
- if cdx_sha1hex != file_meta['sha1hex']:
- print(" REPLAY MISMATCH: cdx:{} replay:{}".format(
- cdx_sha1hex,
- file_meta['sha1hex']),
- file=sys.stderr)
- raise WaybackContentError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
- cdx_sha1hex,
- file_meta['sha1hex']),
+ if cdx_sha1hex != file_meta["sha1hex"]:
+ print(
+ " REPLAY MISMATCH: cdx:{} replay:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
+ file=sys.stderr,
+ )
+ raise WaybackContentError(
+ "replay fetch body didn't match CDX hash cdx:{} body:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
)
return resp.content
- def fetch_replay_redirect(self, url, datetime):
+ def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]:
"""
Fetches an HTTP 3xx redirect Location from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -557,41 +699,65 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = requests.get(
+ # when fetching via `id_`, it is possible to get a 5xx error which
+ # is either a wayback error, or an actual replay of an upstream 5xx
+ # error. the exception control flow here is tweaked, and a
+ # different HTTP session is used, to try and differentiate between
+ # the two cases
+ resp = None
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
+ resp.raise_for_status()
except requests.exceptions.TooManyRedirects:
raise WaybackContentError("redirect loop (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
- try:
- resp.raise_for_status()
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
except Exception as e:
+ if resp is not None and "X-Archive-Src" in resp.headers:
+ raise WaybackContentError(
+ f"expected redirect record but got captured HTTP status: {resp.status_code}"
+ )
raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
# previously checked for "X-Archive-Redirect-Reason" here
- if not "X-Archive-Src" in resp.headers:
+ if (
+ "X-Archive-Src" not in resp.headers
+ and "X-Archive-Redirect-Reason" not in resp.headers
+ ):
raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
redirect_url = resp.headers.get("Location")
# eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
redirect_url = "/".join(redirect_url.split("/")[5:])
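+ # e.g. (editorial worked example) the URL in the comment above becomes
+ # "https://dx.doi.org/10.17504/protocols.io.y2gfybw" after the split/join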
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("http"):
redirect_url = clean_url(redirect_url)
return redirect_url
else:
return None
- def lookup_resource(self, start_url, best_mimetype=None, closest=None):
+ def lookup_resource(
+ self,
+ start_url: str,
+ best_mimetype: Optional[str] = None,
+ closest: Union[str, datetime.datetime, None] = None,
+ ) -> ResourceResult:
"""
Looks in wayback for a resource starting at the URL, following any
redirects. Returns a ResourceResult object, which may indicate a
@@ -617,11 +783,13 @@ class WaybackClient:
"""
next_url = start_url
urls_seen = [start_url]
- for i in range(self.max_redirects):
+ for i in range(self.max_redirects + 1):
print(" URL: {}".format(next_url), file=sys.stderr)
- cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype, closest=closest)
- #print(cdx_row, file=sys.stderr)
- if not cdx_row:
+ next_row: Optional[CdxRow] = self.cdx_client.lookup_best(
+ next_url, best_mimetype=best_mimetype, closest=closest
+ )
+ # print(next_row, file=sys.stderr)
+ if not next_row:
return ResourceResult(
start_url=start_url,
hit=False,
@@ -634,8 +802,10 @@ class WaybackClient:
revisit_cdx=None,
)
+ cdx_row: CdxRow = next_row
+
# first try straight-forward redirect situation
- if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path:
+ if cdx_row.mimetype == "warc/revisit" and "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -648,15 +818,17 @@ class WaybackClient:
status="success",
terminal_url=cdx_row.url,
terminal_dt=cdx_row.datetime,
- terminal_status_code=resource.revisit_cdx.status_code, # ?
+ terminal_status_code=resource.revisit_cdx.status_code,
body=resource.body,
cdx=cdx_row,
revisit_cdx=resource.revisit_cdx,
)
+ # else, continue processing with revisit record
if cdx_row.status_code in (200, 226):
revisit_cdx = None
- if '/' in cdx_row.warc_path:
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -669,7 +841,7 @@ class WaybackClient:
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- cdx_row = cdx_partial_from_row(cdx_row)
+ final_cdx = cdx_partial_from_row(cdx_row)
return ResourceResult(
start_url=start_url,
hit=True,
@@ -678,11 +850,11 @@ class WaybackClient:
terminal_dt=cdx_row.datetime,
terminal_status_code=cdx_row.status_code,
body=body,
- cdx=cdx_row,
+ cdx=final_cdx,
revisit_cdx=revisit_cdx,
)
elif 300 <= (cdx_row.status_code or 0) < 400:
- if '/' in cdx_row.warc_path:
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -703,21 +875,22 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
- if not "://" in resource.location:
+ if "://" not in resource.location:
next_url = urllib.parse.urljoin(next_url, resource.location)
else:
next_url = resource.location
if next_url:
next_url = clean_url(next_url)
else:
- next_url = self.fetch_replay_redirect(
+ redirect_url = self.fetch_replay_redirect(
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- if next_url:
- next_url = clean_url(next_url)
- cdx_row = cdx_partial_from_row(cdx_row)
- if not next_url:
+ if redirect_url:
+ redirect_url = clean_url(redirect_url)
+ if redirect_url:
+ next_url = redirect_url
+ else:
print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
@@ -756,6 +929,7 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
+
return ResourceResult(
start_url=start_url,
hit=False,
@@ -772,39 +946,72 @@ class WaybackClient:
class SavePageNowError(Exception):
pass
+
class SavePageNowBackoffError(SandcrawlerBackoffError):
pass
-SavePageNowResult = namedtuple('SavePageNowResult', [
- 'success',
- 'status',
- 'job_id',
- 'request_url',
- 'terminal_url',
- 'terminal_dt',
- 'resources',
-])
-class SavePageNowClient:
+SavePageNowResult = namedtuple(
+ "SavePageNowResult",
+ [
+ "success",
+ "status",
+ "job_id",
+ "request_url",
+ "terminal_url",
+ "terminal_dt",
+ "resources",
+ ],
+)
- def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs):
- self.ia_access_key = kwargs.get('ia_access_key',
- os.environ.get('IA_ACCESS_KEY'))
- self.ia_secret_key = kwargs.get('ia_secret_key',
- os.environ.get('IA_SECRET_KEY'))
+
+class SavePageNowClient:
+ def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs):
+ self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
+ self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
self.v2endpoint = v2endpoint
- self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
- self.v2_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
- 'Accept': 'application/json',
- 'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key),
- })
+ self.v2_session = requests_retry_session(
+ retries=5, backoff_factor=3, status_forcelist=[502, 504]
+ )
+ self.v2_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
+ "Accept": "application/json",
+ "Authorization": "LOW {}:{}".format(self.ia_access_key, self.ia_secret_key),
+ }
+ )
# 3 minutes total
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0):
+ self.spn_cdx_retry_sec = kwargs.get("spn_cdx_retry_sec", 9.0)
+
+ # these are special-case web domains for which we want SPN2 to not run
+ # a headless browser (brozzler), but instead simply run wget.
+ # the motivation could be to work around browser issues, or in the
+ # future possibly to increase download efficiency (wget/fetch being
+ # faster than browser fetch)
+ self.simple_get_domains = [
+ # direct PDF links
+ "://arxiv.org/pdf/",
+ "://europepmc.org/backend/ptpmcrender.fcgi",
+ "://pdfs.semanticscholar.org/",
+ "://res.mdpi.com/",
+ # platform sites
+ "://zenodo.org/",
+ "://figshare.org/",
+ "://springernature.figshare.com/",
+ # popular simple cloud storage or direct links
+ "://s3-eu-west-1.amazonaws.com/",
+ ]
+
+ def save_url_now_v2(
+ self,
+ request_url: str,
+ force_simple_get: Optional[int] = None,
+ capture_outlinks: int = 0,
+ ) -> SavePageNowResult:
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -838,86 +1045,163 @@ class SavePageNowClient:
None,
None,
)
- resp = self.v2_session.post(
- self.v2endpoint,
- data={
- 'url': request_url,
- 'capture_all': 1,
- 'capture_outlinks': capture_outlinks,
- 'capture_screenshot': 0,
- 'if_not_archived_within': '1d',
- 'force_get': force_simple_get,
- 'skip_first_archive': 1,
- 'outlinks_availability': 0,
- 'js_behavior_timeout': 0,
- },
- )
+ if force_simple_get is None:
+ force_simple_get = 0
+ for domain in self.simple_get_domains:
+ if domain in request_url:
+ force_simple_get = 1
+ break
+
+ # check if SPNv2 user has capacity available
+ resp = self.v2_session.get(f"{self.v2endpoint}/status/user")
+ if resp.status_code == 429:
+ raise SavePageNowBackoffError(
+ f"SPNv2 availability API status_code: {resp.status_code}"
+ )
+ elif resp.status_code != 200:
+ raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+ resp.raise_for_status()
+ status_user = resp.json()
+ if status_user["available"] <= 1:
+ print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+ raise SavePageNowBackoffError(
+ "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+ )
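+ # Editorial note (assumed response shape): /status/user returns JSON with at
+ # least an "available" session count, e.g. {"available": 6, "processing": 1};
+ # only "available" is relied on here.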
+
+ req_data = {
+ "url": request_url,
+ "capture_all": 1,
+ "if_not_archived_within": "1d",
+ "skip_first_archive": 1,
+ "js_behavior_timeout": 0,
+ # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+ # implementation
+ # "capture_screenshot": 0,
+ # "outlinks_availability": 0,
+ }
+ if force_simple_get:
+ req_data["force_get"] = force_simple_get
+ if capture_outlinks:
+ req_data["capture_outlinks"] = capture_outlinks
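+ # Editorial example (hypothetical URL): for "https://arxiv.org/pdf/2101.00001"
+ # the simple-get domain match above sets force_get=1, so the POST data is roughly:
+ #   {"url": "https://arxiv.org/pdf/2101.00001", "capture_all": 1,
+ #    "if_not_archived_within": "1d", "skip_first_archive": 1,
+ #    "js_behavior_timeout": 0, "force_get": 1}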
+ try:
+ resp = self.v2_session.post(
+ self.v2endpoint,
+ data=req_data,
+ )
+ except requests.exceptions.ConnectionError:
+ raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
if resp.status_code == 429:
- raise SavePageNowBackoffError("status_code: {}, url: {}".format(resp.status_code, request_url))
+ raise SavePageNowBackoffError(
+ "status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
elif resp.status_code != 200:
- raise SavePageNowError("SPN2 status_code: {}, url: {}".format(resp.status_code, request_url))
+ raise SavePageNowError(
+ "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
+ resp.raise_for_status()
resp_json = resp.json()
- if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json['message']:
- raise SavePageNowBackoffError(resp_json['message'])
- elif not resp_json or 'job_id' not in resp_json:
+ if (
+ resp_json
+ and "message" in resp_json
+ and "You have already reached the limit of active sessions" in resp_json["message"]
+ ):
+ raise SavePageNowBackoffError(resp_json["message"])
+ elif (
+ resp_json
+ and "message" in resp_json
+ and "The same snapshot had been made" in resp_json["message"]
+ ):
+ return SavePageNowResult(
+ False,
+ "spn2-recent-capture",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif resp_json.get("status") == "error":
+ return SavePageNowResult(
+ False,
+ resp_json.get("status_ext") or resp_json["status"],
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
raise SavePageNowError(
- "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json))
+ "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
+ )
- job_id = resp_json['job_id']
+ job_id = resp_json["job_id"]
print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+ time.sleep(0.1)
# poll until complete
final_json = None
for i in range(self.poll_count):
- resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, resp_json['job_id']))
+ resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id))
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
- status = resp.json()['status']
- if status == 'pending':
+ status = resp.json()["status"]
+ if status == "pending":
time.sleep(self.poll_seconds)
- elif status in ('success', 'error'):
+ elif status in ("success", "error"):
final_json = resp.json()
break
else:
- raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(status, request_url))
+ raise SavePageNowError(
+ "Unknown SPN2 status:{} url:{}".format(status, request_url)
+ )
if not final_json:
raise SavePageNowError("SPN2 timed out (polling count exceeded)")
# if there was a recent crawl of same URL, fetch the status of that
# crawl to get correct datetime
- if final_json.get('original_job_id'):
- print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", file=sys.stderr)
- resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, final_json['original_job_id']))
+ if final_json.get("original_job_id"):
+ print(
+ f" SPN recent capture: {job_id} -> {final_json['original_job_id']}",
+ file=sys.stderr,
+ )
+ resp = self.v2_session.get(
+ "{}/status/{}".format(self.v2endpoint, final_json["original_job_id"])
+ )
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
final_json = resp.json()
- #print(final_json, file=sys.stderr)
+ # print(final_json, file=sys.stderr)
- if final_json['status'] == "success":
- if final_json.get('original_url').startswith('/'):
- print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", file=sys.stderr)
+ if final_json["status"] == "success":
+ if final_json.get("original_url").startswith("/"):
+ print(
+ f" truncated URL in JSON: {request_url} {json.dumps(final_json)}",
+ file=sys.stderr,
+ )
return SavePageNowResult(
True,
"success",
job_id,
request_url,
- final_json['original_url'],
- final_json['timestamp'],
- final_json['resources'],
+ final_json["original_url"],
+ final_json["timestamp"],
+ final_json.get("resources") or None,
)
else:
- if final_json['status'] == 'pending':
- final_json['status'] = 'error:pending'
+ if final_json["status"] == "pending":
+ final_json["status"] = "error:pending"
return SavePageNowResult(
False,
- final_json.get('status_ext') or final_json['status'],
+ final_json.get("status_ext") or final_json["status"],
job_id,
request_url,
None,
@@ -925,24 +1209,38 @@ class SavePageNowClient:
None,
)
- def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
+ def crawl_resource(
+ self,
+ start_url: str,
+ wayback_client: WaybackClient,
+ force_simple_get: Optional[int] = None,
+ ) -> ResourceResult:
"""
- Runs a SPN2 crawl, then fetches body from wayback.
+ Runs a SPN2 crawl, then fetches body.
- TODO: possible to fetch from petabox?
+ There is a delay between SPN2 crawls and WARC upload to petabox, so we
+ need to fetch the body via wayback replay instead of petabox
+ range-request.
"""
# HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
- if 'gzbd.cnki.net/' in start_url:
- spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get, capture_outlinks=1)
+ if "gzbd.cnki.net/" in start_url:
+ spn_result = self.save_url_now_v2(
+ start_url, force_simple_get=force_simple_get, capture_outlinks=1
+ )
else:
spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get)
if not spn_result.success:
status = spn_result.status
- if status in ("error:invalid-url", "error:not-found",
- "error:invalid-host-resolution", "error:gateway-timeout",
- "error:too-many-redirects", "error:read-timeout"):
+ if status in (
+ "error:invalid-url",
+ "error:not-found",
+ "error:invalid-host-resolution",
+ "error:gateway-timeout",
+ "error:too-many-redirects",
+ "error:read-timeout",
+ ):
status = status.replace("error:", "")
elif status in ("error:no-access", "error:forbidden"):
status = "forbidden"
@@ -953,7 +1251,10 @@ class SavePageNowClient:
elif status.startswith("error:"):
status = "spn2-" + status
# despite other errors, call these a failure (so we don't retry)
- if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1")):
+ if spn_result.terminal_url and (
+ spn_result.terminal_url.endswith("/cookieAbsent")
+ or spn_result.terminal_url.endswith("cookieSet=1")
+ ):
status = "blocked-cookie"
return ResourceResult(
start_url=start_url,
@@ -966,10 +1267,10 @@ class SavePageNowClient:
cdx=None,
revisit_cdx=None,
)
- #print(spn_result, file=sys.stderr)
+ # print(spn_result, file=sys.stderr)
# detect partial URL response (aka, success, but missing full URL)
- if not "://" in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
+ if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith("/"):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -983,7 +1284,9 @@ class SavePageNowClient:
)
# don't try to CDX fetch for this common cookie block terminal
- if spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"):
+ if spn_result.terminal_url.endswith(
+ "/cookieAbsent"
+ ) or spn_result.terminal_url.endswith("cookieSet=1"):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -996,7 +1299,7 @@ class SavePageNowClient:
revisit_cdx=None,
)
- cdx_row = None
+ cdx_row: Optional[CdxRow] = None
# hack to work around elsevier weirdness
if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
@@ -1008,7 +1311,7 @@ class SavePageNowClient:
cdx_row = elsevier_pdf_cdx
else:
print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
- #print(elsevier_pdf_cdx, file=sys.stderr)
+ # print(elsevier_pdf_cdx, file=sys.stderr)
if not cdx_row:
# lookup exact
@@ -1020,8 +1323,17 @@ class SavePageNowClient:
url=spn_result.terminal_url,
datetime=spn_result.terminal_dt,
filter_status_code=filter_status_code,
- retry_sleep=9.0,
+ retry_sleep=self.spn_cdx_retry_sec,
)
+ # sometimes there are fuzzy http/https self-redirects with the
+ # same SURT; try to work around that
+ if cdx_row.status_code >= 300 and cdx_row.status_code < 400:
+ cdx_row = wayback_client.cdx_client.fetch(
+ url=spn_result.terminal_url,
+ datetime=spn_result.terminal_dt,
+ filter_status_code=200,
+ retry_sleep=self.spn_cdx_retry_sec,
+ )
except KeyError as ke:
print(" CDX KeyError: {}".format(ke), file=sys.stderr)
return ResourceResult(
@@ -1036,10 +1348,11 @@ class SavePageNowClient:
revisit_cdx=None,
)
- #print(cdx_row, file=sys.stderr)
+ # print(cdx_row, file=sys.stderr)
revisit_cdx = None
- if '/' in cdx_row.warc_path:
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
# Usually can't do this kind of direct fetch because CDX result is recent/live
resource = wayback_client.fetch_petabox(
csize=cdx_row.warc_csize,
@@ -1057,7 +1370,7 @@ class SavePageNowClient:
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- except (WaybackError, WaybackContentError) as we:
+ except (WaybackError, WaybackContentError):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -1070,24 +1383,48 @@ class SavePageNowClient:
revisit_cdx=None,
)
# warc_path etc will change, so strip them out
- cdx_row = cdx_partial_from_row(cdx_row)
+ final_cdx = cdx_partial_from_row(cdx_row)
- return ResourceResult(
- start_url=start_url,
- hit=True,
- status="success",
- terminal_url=cdx_row.url,
- terminal_dt=cdx_row.datetime,
- terminal_status_code=cdx_row.status_code,
- body=body,
- cdx=cdx_row,
- revisit_cdx=revisit_cdx,
- )
+ assert cdx_row.status_code
+ if cdx_row.status_code in (200, 226):
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="terminal-bad-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
-def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
- if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
- print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+def fix_transfer_encoding(
+ file_meta: dict, resource: ResourceResult
+) -> Tuple[dict, ResourceResult]:
+ if (
+ resource.body
+ and file_meta["mimetype"] == "application/gzip"
+ and resource.cdx
+ and resource.cdx.mimetype != "application/gzip"
+ ):
+ print(
+ " transfer encoding not stripped: {}".format(resource.cdx.mimetype),
+ file=sys.stderr,
+ )
inner_body = gzip.decompress(resource.body)
if not inner_body:
raise Exception("null body inside transfer encoding")
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
deleted file mode 100644
index abcc156..0000000
--- a/python/sandcrawler/ingest.py
+++ /dev/null
@@ -1,754 +0,0 @@
-
-import sys
-import json
-import gzip
-import time
-import base64
-import xml.etree.ElementTree
-from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
-from http.server import BaseHTTPRequestHandler, HTTPServer
-
-import requests
-from selectolax.parser import HTMLParser
-
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_ingest import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
-from sandcrawler.xml import xml_reserialize
-
-
-class IngestFileWorker(SandcrawlerWorker):
- """
- High level flow is to look in history first, then go to live web if
- resource not found. Following redirects is treated as "fetching a
- resource". Current version fetches a single resource; if it isn't a hit
- but is an HTML 200, treats it as a landing page, tries to extract
- fulltext link, then fetches that resource.
-
- process(request, key=None) -> response
- Does all the things!
-
- Check existing processing (short circuit):
-
- check_existing_ingest(base_url) -> ingest_file_result or none
- process_existing(result) -> response
- try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit()
-
- Fetch resource:
-
- find_resource(url) -> ResourceResult
-
- Process resource:
-
- process_hit(ResourceResult) -> response
- process_grobid(ResourceResult)
- """
-
- def __init__(self, sink=None, **kwargs):
- super().__init__()
-
- self.sink = sink
- self.wayback_client = kwargs.get('wayback_client')
- if not self.wayback_client:
- self.wayback_client = WaybackClient()
- self.spn_client = kwargs.get('spn_client')
- if not self.spn_client:
- self.spn_client = SavePageNowClient()
- self.grobid_client = kwargs.get('grobid_client')
- if not self.grobid_client:
- self.grobid_client = GrobidClient()
- self.pgrest_client = kwargs.get('pgrest_client')
- if not self.pgrest_client:
- self.pgrest_client = SandcrawlerPostgrestClient()
- self.grobid_sink = kwargs.get('grobid_sink')
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
- self.pdftext_sink = kwargs.get('pdftext_sink')
- self.xmldoc_sink = kwargs.get('xmldoc_sink')
- self.htmlteixml_sink = kwargs.get('htmlteixml_sink')
- self.max_hops = 6
-
- self.try_existing_ingest = kwargs.get('try_existing_ingest', False)
- self.try_existing_grobid = kwargs.get('try_existing_grobid', True)
- self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True)
- self.try_wayback = kwargs.get('try_wayback', True)
- self.try_spn2 = kwargs.get('try_spn2', True)
- self.html_quick_mode = kwargs.get('html_quick_mode', False)
- self.adblock_rules = load_adblock_rules()
- self.max_html_resources = 200
-
- self.base_url_blocklist = [
- # robot blocking
- "://hkvalidate.perfdrive.com/",
-
- # temporary, until we implement specific fetch and 'petabox' output
- "://archive.org/",
- "://www.archive.org/",
- "://web.archive.org/web/",
- "://openlibrary.org/",
- "://www.openlibrary.org/",
- "://fatcat.wiki/",
-
- # Domain squats
- "://bartandjones.com",
- "://ijretm.com",
- "://ijrcemas.com",
- "://jist.net.in",
- "://croisements-revue.org",
-
- # all stubs/previews, not full papers
- "://page-one.live.cf.public.springer.com",
-
- # large datasets-only (no PDF expected)
- "plutof.ut.ee/",
- "www.gbif.org/",
- "doi.pangaea.de/",
- "www.plate-archive.org/",
- "://doi.org/10.25642/ipk/gbis/",
- "://apex.ipk-gatersleben.de/",
-
- # Historical non-paper content:
- "dhz.uni-passau.de/", # newspapers
- "digital.ucd.ie/", # ireland national historical
-
- # DOI prefixes
- "://doi.org/10.2307/", # JSTOR; slow and many redirects
- ]
-
- self.wall_blocklist = [
- # loginwall
- "://profile.thieme.de/HTML/sso/ejournals/login.htm",
- "://login.bepress.com/"
- "?SAMLRequest="
- ]
-
- # these are special-case web domains for which we want SPN2 to not run
- # a headless browser (brozzler), but instead simply run wget.
- # the motivation could be to work around browser issues, or in the
- # future possibly to increase download efficiency (wget/fetch being
- # faster than browser fetch)
- self.spn2_simple_get_domains = [
- # direct PDF links
- "://arxiv.org/pdf/",
- "://europepmc.org/backend/ptpmcrender.fcgi",
- "://pdfs.semanticscholar.org/",
- "://res.mdpi.com/",
-
- # platform sites
- "://zenodo.org/",
- "://figshare.org/",
- "://springernature.figshare.com/",
-
- # popular simple cloud storage or direct links
- "://s3-eu-west-1.amazonaws.com/",
- ]
-
-
- def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
- """
- Check in sandcrawler-db (postgres) to see if we have already ingested
- this URL (ingest file result table).
-
- Returns existing row *if* found *and* we should use it, otherwise None.
-
- Looks at existing ingest results and makes a decision based on, eg,
- status and timestamp.
- """
- if not self.try_existing_ingest:
- return None
- existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
- # TODO: filter on more flags?
- if existing and existing['hit'] == True:
- return existing
- else:
- return None
-
- def find_resource(self, url, best_mimetype=None, force_recrawl=False) -> Optional[ResourceResult]:
- """
- Looks in wayback for a resource starting at the URL, following any
- redirects. If a hit isn't found, try crawling with SPN.
- """
- via = "none"
- resource = None
-
- if url.startswith("http://web.archive.org/web/") or url.startswith("https://web.archive.org/web/"):
- raise NotImplementedError("handling direct wayback links not supported yet")
-
- if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
- raise NotImplementedError("fetching from archive.org not implemented yet")
-
- if self.try_wayback and not force_recrawl:
- via = "wayback"
- resource = self.wayback_client.lookup_resource(url, best_mimetype)
-
- # check for "soft 404" conditions, where we should retry with live SPNv2
- soft404 = False
- # NOTE: these are often not working with SPNv2 either, so disabling. If
- # we really want to try again, should do force-recrawl
- #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
- # soft404 = True
-
- old_failure = False
- if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000':
- old_failure = True
-
- if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure):
- via = "spn2"
- force_simple_get = 0
- for domain in self.spn2_simple_get_domains:
- if domain in url:
- force_simple_get = 1
- break
- resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
- print("[FETCH {:>6}] {} {}".format(
- via,
- (resource and resource.status),
- (resource and resource.terminal_url) or url),
- file=sys.stderr)
- return resource
-
- def process_existing(self, request: dict, result_row: dict) -> dict:
- """
- If we have an existing ingest file result, do any database fetches or
- additional processing necessary to return a result.
- """
- raise NotImplementedError("process_existing() not tested or safe yet")
- assert result_row['hit']
- existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt'])
- if not (existing_file_meta and existing_grobid and existing_cdx):
- raise NotImplementedError("partially-exsiting records not implemented yet")
- result = {
- 'hit': result_row['hit'],
- 'status': "existing",
- 'request': request,
- 'grobid': existing_grobid,
- 'file_meta': existing_file_meta,
- 'cdx': existing_cdx,
- 'terminal': {
- 'terminal_url': result_row['terminal_url'],
- 'terminal_dt': result_row['terminal_dt'],
- 'terminal_status_code': result_row['terminal_status_code'],
- 'terminal_sha1hex': result_row['terminal_sha1hex'],
- },
- }
- return result
-
- def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Run all the necessary processing for a new/fresh ingest hit.
- """
- if ingest_type == "pdf":
- return {
- 'grobid': self.process_grobid(resource, file_meta),
- 'pdf_meta': self.process_pdfextract(resource, file_meta),
- }
- elif ingest_type == "xml":
- return {
- 'xml_meta': self.process_xml(resource, file_meta),
- }
- elif ingest_type == "html":
- html_info = self.process_html(resource, file_meta)
- # if there is no html_biblio, don't clobber anything possibly extracted earlier
- if 'html_biblio' in html_info and not html_info['html_biblio']:
- html_info.pop('html_biblio')
- return html_info
- else:
- raise NotImplementedError(f"process {ingest_type} hit")
-
- def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Submits to resource body to GROBID for processing.
-
- TODO: By default checks sandcrawler-db for an existing row first, then
- decide if we should re-process
- """
- if self.try_existing_grobid:
- existing = self.pgrest_client.get_grobid(file_meta['sha1hex'])
- if existing:
- print("found existing GROBID result", file=sys.stderr)
- return existing
-
- # Need to actually processes
- result = self.grobid_client.process_fulltext(resource.body)
- if self.grobid_sink:
- # extra fields for GROBID kafka messages
- result['file_meta'] = file_meta
- result['key'] = result['file_meta']['sha1hex']
- self.grobid_sink.push_record(result.copy())
- if result['status'] == "success":
- metadata = self.grobid_client.metadata(result)
- if metadata:
- result['metadata'] = self.grobid_client.metadata(result)
- result['fatcat_release'] = result['metadata'].pop('fatcat_release', None)
- result['grobid_version'] = result['metadata'].pop('grobid_version', None)
- result.pop('tei_xml', None)
- result.pop('file_meta', None)
- result.pop('key', None)
- return result
-
- def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Extracts thumbnail and pdf_meta info from PDF.
-
- By default checks sandcrawler-db for an existing row first, then decide
- if we should re-process.
-
- TODO: difference between Kafka schema and SQL/postgrest schema
- """
- if self.try_existing_pdfextract:
- existing = self.pgrest_client.get_pdf_meta(file_meta['sha1hex'])
- if existing:
- print("found existing pdf_meta result", file=sys.stderr)
- result = PdfExtractResult.from_pdf_meta_dict(existing)
- return result.to_pdftext_dict()
-
- # Need to actually processes
- result = process_pdf(resource.body)
- assert result.file_meta['sha1hex'] == file_meta['sha1hex']
- if self.thumbnail_sink and result.page0_thumbnail is not None:
- self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
- if self.pdftext_sink:
- self.pdftext_sink.push_record(result.to_pdftext_dict(), key=result.sha1hex)
- result.page0_thumbnail = None
- result.text = None
- result.file_meta = None
- return result.to_pdftext_dict()
-
- def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Simply publishes to Kafka topic.
-
- In the future, could extract other metadata here (like body word
- count), or attempting to fetch sub-resources.
- """
- if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml":
- try:
- jats_xml = xml_reserialize(resource.body)
- except xml.etree.ElementTree.ParseError:
- return dict(status="xml-parse-error")
- msg = dict(
- sha1hex=file_meta["sha1hex"],
- status="success",
- jats_xml=jats_xml,
- )
- self.xmldoc_sink.push_record(msg, key=file_meta['sha1hex'])
- return dict(status="success")
-
- def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
-
- assert resource.body
- try:
- html_doc = HTMLParser(resource.body)
- except ValueError as ve:
- return dict(
- status="html-selectolax-error",
- )
- html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
- assert html_biblio
- html_body = html_extract_body_teixml(resource.body)
- html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
- html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count'))
- html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
-
- if html_scope in ('blocked-captcha','blocked-cookie','blocked-forbidden'):
- return dict(
- status=html_scope,
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- )
- elif html_scope == 'unknown':
- html_body.pop("tei_xml", None)
- return dict(
- status="unknown-scope",
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
- elif html_scope not in ('article-fulltext',):
- html_body.pop("tei_xml", None)
- return dict(
- status="wrong-scope",
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
-
- raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules)
- if len(raw_resources) > self.max_html_resources:
- html_body.pop("tei_xml", None)
- return dict(
- status="too-many-resources",
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
-
- if self.htmlteixml_sink and html_body['status'] == "success":
- self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])
-
- html_body.pop("tei_xml", None)
-
- partial_result = dict(
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
-
- when = parse_cdx_datetime(resource.cdx.datetime)
- full_resources: List[WebResource] = []
-
- try:
- if self.html_quick_mode:
- print(" WARN: running quick CDX-only fetches", file=sys.stderr)
- full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
- else:
- full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
- except PetaboxError as e:
- partial_result['status'] = 'petabox-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except CdxApiError as e:
- partial_result['status'] = 'cdx-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except WaybackError as e:
- partial_result['status'] = 'wayback-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except WaybackContentError as e:
- partial_result['status'] = 'wayback-content-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except NoCaptureError as e:
- partial_result['status'] = 'html-resource-no-capture'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
-
- return dict(
- html_body=html_body,
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
- )
-
- def timeout_response(self, task: dict) -> dict:
- print("[TIMEOUT]", file=sys.stderr)
- return dict(
- request=task,
- hit=False,
- status="timeout",
- error_message="ingest worker internal timeout",
- )
-
- def want(self, request: dict) -> bool:
- if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html'):
- return False
- return True
-
- def process(self, request: dict, key: Any = None) -> dict:
-
- # old backwards compatibility
- if request.get('ingest_type') == 'file':
- request['ingest_type'] = 'pdf'
-
- ingest_type = request.get('ingest_type')
- if ingest_type not in ("pdf", "xml", "html"):
- raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
-
- # parse/clean URL
- # note that we pass through the original/raw URL, and that is what gets
- # persisted in database table
- base_url = clean_url(request['base_url'])
-
- force_recrawl = bool(request.get('force_recrawl', False))
-
- for block in self.base_url_blocklist:
- if block in base_url:
- print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
- return dict(request=request, hit=False, status="skip-url-blocklist")
-
- print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
-
- best_mimetype = None
- if ingest_type == "pdf":
- best_mimetype = "application/pdf"
- elif ingest_type == "xml":
- best_mimetype = "text/xml"
- elif ingest_type == "html":
- best_mimetype = "text/html"
-
- existing = self.check_existing_ingest(ingest_type, base_url)
- if existing:
- return self.process_existing(request, existing)
-
- result: Dict[str, Any] = dict(request=request, hit=False)
-
- next_url = base_url
- hops = [base_url]
-
- while len(hops) <= self.max_hops:
-
- result['hops'] = hops
-
- # check against blocklist again on each hop
- for block in self.base_url_blocklist:
- if block in next_url:
- result['status'] = "skip-url-blocklist"
- return result
-
- # check against known loginwall URLs
- for block in self.wall_blocklist:
- if block in next_url:
- result['status'] = "skip-wall"
- return result
-
- # check for popular cookie blocking URL patterns. On successful SPN
- # crawls, shouldn't see these redirect URLs
- if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url:
- result['status'] = 'blocked-cookie'
- return result
-
- try:
- resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
- except SavePageNowError as e:
- result['status'] = 'spn2-error'
- result['error_message'] = str(e)[:1600]
- return result
- except PetaboxError as e:
- result['status'] = 'petabox-error'
- result['error_message'] = str(e)[:1600]
- return result
- except CdxApiError as e:
- result['status'] = 'cdx-error'
- result['error_message'] = str(e)[:1600]
- # add a sleep in cdx-error path as a slow-down
- time.sleep(2.0)
- return result
- except WaybackError as e:
- result['status'] = 'wayback-error'
- result['error_message'] = str(e)[:1600]
- return result
- except WaybackContentError as e:
- result['status'] = 'wayback-content-error'
- result['error_message'] = str(e)[:1600]
- return result
- except NotImplementedError as e:
- result['status'] = 'not-implemented'
- result['error_message'] = str(e)[:1600]
- return result
-
- assert resource
-
- if resource.terminal_url:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
- if resource.terminal_url not in result['hops']:
- result['hops'].append(resource.terminal_url)
-
- if not resource.hit:
- result['status'] = resource.status
- return result
-
- if resource.terminal_url and ('/cookieAbsent' in next_url or 'cookieSet=1' in resource.terminal_url):
- result['status'] = 'blocked-cookie'
- return result
-
- if not resource.body:
- result['status'] = 'null-body'
- return result
-
- file_meta = gen_file_metadata(resource.body)
- try:
- file_meta, resource = fix_transfer_encoding(file_meta, resource)
- except Exception as e:
- result['status'] = 'bad-gzip-encoding'
- result['error_message'] = str(e)
- return result
-
- if not resource.body or file_meta['size_bytes'] == 0:
- result['status'] = 'null-body'
- return result
-
- # here we split based on ingest type to try and extract a next hop
- html_ish_resource = bool(
- "html" in file_meta['mimetype']
- or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml"
- or "application/xml" in file_meta['mimetype']
- or "text/xml" in file_meta['mimetype']
- )
- html_biblio = None
- html_doc = None
- if html_ish_resource and resource.body:
- try:
- html_doc = HTMLParser(resource.body)
- html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
- if html_biblio:
- if not 'html_biblio' in result or html_biblio.title:
- result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
- #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
- except ValueError:
- pass
-
- if ingest_type == "pdf" and html_ish_resource:
-
- # the new style of URL extraction (already computed)
- if html_biblio and html_biblio.pdf_fulltext_url:
- fulltext_url = dict(
- pdf_url=html_biblio.pdf_fulltext_url,
- technique="html_biblio",
- )
- else:
- fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
- result['extract_next_hop'] = fulltext_url
- if not fulltext_url:
- result['status'] = 'no-pdf-link'
- return result
- next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') or ""
- assert next_url
- next_url = clean_url(next_url)
- print("[PARSE {:>6}] {} {}".format(
- ingest_type,
- fulltext_url.get('technique'),
- next_url,
- ),
- file=sys.stderr)
- if next_url in hops:
- result['status'] = 'link-loop'
- result['error_message'] = "repeated: {}".format(next_url)
- return result
- hops.append(next_url)
- continue
- elif ingest_type == "xml" and html_ish_resource:
- if html_biblio and html_biblio.xml_fulltext_url:
- next_url = html_biblio.xml_fulltext_url
- technique = "html_biblio"
- print("[PARSE {:>6}] {} {}".format(
- ingest_type,
- technique,
- next_url,
- ),
- file=sys.stderr)
- if next_url in hops:
- result['status'] = 'link-loop'
- result['error_message'] = "repeated: {}".format(next_url)
- return result
- hops.append(next_url)
- continue
- elif ingest_type == "html" and html_ish_resource:
- if html_biblio and html_biblio.html_fulltext_url:
- next_url = html_biblio.html_fulltext_url
- technique = "html_biblio"
- if next_url in hops:
- # for HTML ingest, we don't count this as a link-loop
- break
- print("[PARSE {:>6}] {} {}".format(
- ingest_type,
- technique,
- next_url,
- ),
- file=sys.stderr)
- hops.append(next_url)
- continue
-
- # default is to NOT keep hopping
- break
-
- if len(hops) >= self.max_hops:
- result['status'] = "max-hops-exceeded"
- return result
-
- # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
- assert resource
- assert resource.hit == True
- assert resource.terminal_status_code in (200, 226)
-
- if resource.terminal_url:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- "terminal_sha1hex": file_meta['sha1hex'],
- }
-
- result['file_meta'] = file_meta
- result['cdx'] = cdx_to_dict(resource.cdx)
- if resource.revisit_cdx:
- result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
-
- if ingest_type == "pdf":
- if file_meta['mimetype'] != "application/pdf":
- result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
- return result
- elif ingest_type == "xml":
- if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
- result['status'] = "wrong-mimetype"
- return result
- elif ingest_type == "html":
- if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
- result['status'] = "wrong-mimetype"
- return result
- else:
- raise NotImplementedError()
-
- info = self.process_hit(ingest_type, resource, file_meta)
- result.update(info)
-
- # check if processing turned up an error
- if info.get('status') not in ('success', None):
- result['status'] = info['status']
- return result
-
- result['status'] = "success"
- result['hit'] = True
- if ingest_type == "pdf":
- print("[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
- ingest_type,
- result.get('file_meta', {}).get('sha1hex'),
- result.get('grobid', {}).get('status_code'),
- result.get('pdf_meta', {}).get('status'),
- ),
- file=sys.stderr)
- else:
- print("[SUCCESS {:>5}] sha1:{}".format(
- ingest_type,
- result.get('file_meta', {}).get('sha1hex'),
- ),
- file=sys.stderr)
- return result
-
-
-class IngestFileRequestHandler(BaseHTTPRequestHandler):
- def do_POST(self):
- if self.path != "/ingest":
- self.send_response(404)
- self.end_headers()
- self.wfile.write("404: Not Found")
- return
- length = int(self.headers.get('content-length'))
- request = json.loads(self.rfile.read(length).decode('utf-8'))
- print("Got request: {}".format(request))
- ingester = IngestFileWorker()
- result = ingester.process(request)
- self.send_response(200)
- self.end_headers()
- self.wfile.write(json.dumps(result))
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
new file mode 100644
index 0000000..03277f8
--- /dev/null
+++ b/python/sandcrawler/ingest_file.py
@@ -0,0 +1,925 @@
+import json
+import sys
+import time
+import xml.etree.ElementTree
+from http.server import BaseHTTPRequestHandler
+from typing import Any, Dict, List, Optional
+
+from selectolax.parser import HTMLParser
+
+from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.grobid import GrobidClient
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import (
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiError,
+ NoCaptureError,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_html import (
+ WebResource,
+ fetch_html_resources,
+ html_extract_body_teixml,
+ html_guess_platform,
+ html_guess_scope,
+ quick_fetch_html_resources,
+)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.xml import xml_reserialize
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFileWorker(SandcrawlerWorker):
+ """
+ High level flow is to look in history first, then go to live web if
+ resource not found. Following redirects is treated as "fetching a
+ resource". Current version fetches a single resource; if it isn't a hit
+ but is an HTML 200, treats it as a landing page, tries to extract
+ fulltext link, then fetches that resource.
+
+ process(request, key=None) -> response
+ Does all the things!
+
+ Check existing processing (short circuit):
+
+ check_existing_ingest(base_url) -> ingest_file_result or none
+ process_existing(result) -> response
+            Try fetching all the rows we want; if any don't exist, fetch the resource itself and call process_file_hit()
+
+ Fetch resource:
+
+ find_resource(url) -> ResourceResult
+
+ Process resource:
+
+ process_file_hit(ResourceResult) -> response
+ process_grobid(ResourceResult)
+ """
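+
+    # Illustrative sketch of direct use (assumes reachable wayback/SPN, GROBID,
+    # and postgrest backends with default configuration; the URL below is a
+    # placeholder, not a real ingest target):
+    #
+    #   worker = IngestFileWorker(try_spn2=False)
+    #   request = {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"}
+    #   result = worker.process(request)
+    #   print(result["status"], result.get("terminal", {}).get("terminal_url"))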
+
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__()
+
+ self.sink = sink
+
+ if kwargs.get("wayback_client"):
+ self.wayback_client: WaybackClient = kwargs["wayback_client"]
+ else:
+ self.wayback_client = WaybackClient()
+
+ if kwargs.get("spn_client"):
+ self.spn_client: SavePageNowClient = kwargs["spn_client"]
+ else:
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+
+ if kwargs.get("grobid_client"):
+ self.grobid_client: GrobidClient = kwargs["grobid_client"]
+ else:
+ self.grobid_client = GrobidClient()
+
+ if kwargs.get("pgrest_client"):
+ self.pgrest_client: SandcrawlerPostgrestClient = kwargs["pgrest_client"]
+ else:
+ self.pgrest_client = SandcrawlerPostgrestClient()
+
+ self.grobid_sink = kwargs.get("grobid_sink")
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
+ self.pdftext_sink = kwargs.get("pdftext_sink")
+ self.xmldoc_sink = kwargs.get("xmldoc_sink")
+ self.htmlteixml_sink = kwargs.get("htmlteixml_sink")
+ self.max_hops = 8
+
+ self.try_existing_ingest = kwargs.get("try_existing_ingest", False)
+ self.try_existing_grobid = kwargs.get("try_existing_grobid", True)
+ self.try_existing_pdfextract = kwargs.get("try_existing_pdfextract", True)
+ self.try_wayback = kwargs.get("try_wayback", True)
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.html_quick_mode = kwargs.get("html_quick_mode", False)
+ self.adblock_rules = load_adblock_rules()
+ self.max_html_resources = 200
+
+ self.base_url_blocklist = [
+ "://localhost/",
+ "://127.0.0.1/",
+ # robot blocking / rate-limited
+ "://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
+ # temporary, until we implement specific fetch and 'petabox' output
+ "://archive.org/",
+ "://www.archive.org/",
+ "://web.archive.org/web/",
+ # out of scope
+ "://openlibrary.org/",
+ "://www.openlibrary.org/",
+ "://fatcat.wiki/",
+ "://scholar.archive.org/",
+ "://orcid.org/",
+ # Domain squats
+ "://bartandjones.com",
+ "://ijretm.com",
+ "://ijrcemas.com",
+ "://jist.net.in",
+ "://croisements-revue.org",
+ # all stubs/previews, not full papers
+ "://page-one.live.cf.public.springer.com",
+ # large datasets-only (no PDF expected)
+ "plutof.ut.ee/",
+ "www.gbif.org/",
+ "doi.pangaea.de/",
+ "www.plate-archive.org/",
+ "://doi.org/10.25642/ipk/gbis/",
+ "://apex.ipk-gatersleben.de/",
+ "fao.org/glis/",
+ # Historical non-paper content:
+ "dhz.uni-passau.de/", # newspapers
+ "digital.ucd.ie/", # ireland national historical
+ # DOI prefixes
+ "doi.org/10.2307/", # JSTOR; slow and many redirects
+ "doi.org/10.18730/", # fao.org: database entry
+ "doi.org/10.15468/", # gbif.org: database entry
+ "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
+ # deprecated domain (doesn't redirect correctly)
+ "://edoc.mpg.de/",
+ # bogus/spam PDFs
+ "://isiarticles.com/",
+ ]
+
+ self.wall_blocklist = [
+ # loginwall
+ "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+ "://login.bepress.com/",
+ "?SAMLRequest=",
+ "://osapublishing.org/captcha/",
+ "/password-login",
+ "://gateway.isiknowledge.com/",
+ "/login?TARGET=",
+ "jstage.jst.go.jp/sblogin",
+ "://acw.elsevier.com/SSOCore",
+ "://acw.sciencedirect.com/SSOCore",
+ "/login?source=",
+ ]
+
+ self.cookie_blocklist = [
+ "/cookieAbsent",
+ "cookieSet=1",
+ "error=cookies_not_supported",
+ # SPNv2 seems to work (not end up here), but heritrix fails
+ "://secure.jbs.elsevierhealth.com/",
+ ]
+
+ self.src_valid_mimetypes = [
+ "text/x-tex",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+ "application/zip",
+ "application/x-tar",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ]
+
+ self.component_valid_mimetypes = [
+ "image/jpeg",
+ "image/tiff",
+ "image/png",
+ "image/gif",
+ "audio/mpeg",
+ "video/mp4",
+ "video/mpeg",
+ "text/plain",
+ "text/csv",
+ "text/x-r-source", # dataverse
+ "text/tab-separated-values", # dataverse
+ "text/x-rst", # dataverse
+ "application/x-rlang-transport", # dataverse
+ "application/json",
+ "application/xml",
+ "application/pdf",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+        "application/zip",
+        "application/x-rar",
+ "application/x-7z-compressed",
+ "application/x-tar",
+ "application/vnd.ms-powerpoint",
+ "application/vnd.ms-excel",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ]
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Check in sandcrawler-db (postgres) to see if we have already ingested
+ this URL (ingest file result table).
+
+ Returns existing row *if* found *and* we should use it, otherwise None.
+
+ Looks at existing ingest results and makes a decision based on, eg,
+ status and timestamp.
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def find_resource(
+ self, url: str, best_mimetype: Optional[str] = None, force_recrawl: bool = False
+ ) -> Optional[ResourceResult]:
+ """
+ Looks in wayback for a resource starting at the URL, following any
+ redirects. If a hit isn't found, try crawling with SPN.
+ """
+ via = "none"
+ resource = None
+
+ if url.startswith("http://web.archive.org/web/") or url.startswith(
+ "https://web.archive.org/web/"
+ ):
+ raise NotImplementedError("handling direct wayback links not supported yet")
+
+ if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
+ raise NotImplementedError("fetching from archive.org not implemented yet")
+
+ if self.try_wayback and not force_recrawl:
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(url, best_mimetype)
+
+ # check for "soft 404" conditions, where we should retry with live SPNv2
+ soft404 = False
+        # NOTE: these often don't work with SPNv2 either, so this is disabled. If
+        # we really want to try again, we should do a force-recrawl
+ # if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
+ # soft404 = True
+
+ old_failure = False
+ if (
+ resource
+ and not resource.hit
+ and resource.terminal_dt
+ and resource.terminal_dt < "20190000000000"
+ ):
+ old_failure = True
+
+ if self.try_spn2 and (
+ resource is None
+ or (resource and resource.status == "no-capture")
+ or soft404
+ or old_failure
+ ):
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(url, self.wayback_client)
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via, (resource and resource.status), (resource and resource.terminal_url) or url
+ ),
+ file=sys.stderr,
+ )
+ return resource
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest file result, do any database fetches or
+ additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+ assert result_row["hit"]
+ existing_file_meta = self.pgrest_client.get_file_meta(result_row["terminal_sha1hex"])
+ existing_grobid = self.pgrest_client.get_grobid(result_row["terminal_sha1hex"])
+ existing_cdx = self.pgrest_client.get_cdx(
+ result_row["terminal_url"], result_row["terminal_dt"]
+ )
+ if not (existing_file_meta and existing_grobid and existing_cdx):
+            raise NotImplementedError("partially-existing records not implemented yet")
+ result = {
+ "hit": result_row["hit"],
+ "status": "existing",
+ "request": request,
+ "grobid": existing_grobid,
+ "file_meta": existing_file_meta,
+ "cdx": existing_cdx,
+ "terminal": {
+ "terminal_url": result_row["terminal_url"],
+ "terminal_dt": result_row["terminal_dt"],
+ "terminal_status_code": result_row["terminal_status_code"],
+ "terminal_sha1hex": result_row["terminal_sha1hex"],
+ },
+ }
+ return result
+
+ def process_file_hit(
+ self, ingest_type: str, resource: ResourceResult, file_meta: dict
+ ) -> dict:
+ """
+ Run all the necessary processing for a new/fresh ingest hit.
+ """
+ if (
+ ingest_type in ["dataset-file", "component"]
+ and file_meta["mimetype"] == "application/pdf"
+ ):
+ ingest_type = "pdf"
+ if ingest_type == "pdf":
+ return {
+ "grobid": self.process_grobid(resource, file_meta),
+ "pdf_meta": self.process_pdfextract(resource, file_meta),
+ }
+ elif ingest_type == "xml":
+ return {
+ "xml_meta": self.process_xml(resource, file_meta),
+ }
+ elif ingest_type == "html":
+ html_info = self.process_html(resource, file_meta)
+ # if there is no html_biblio, don't clobber anything possibly extracted earlier
+ if "html_biblio" in html_info and not html_info["html_biblio"]:
+ html_info.pop("html_biblio")
+ return html_info
+ elif ingest_type == "src":
+ return {}
+ elif ingest_type == "component":
+ return {}
+ elif ingest_type == "dataset-file":
+ return {}
+ else:
+ raise NotImplementedError(f"process {ingest_type} hit")
+
+ def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+        Submits the resource body to GROBID for processing.
+
+        TODO: By default checks sandcrawler-db for an existing row first, then
+        decides whether we should re-process.
+ """
+ if self.try_existing_grobid:
+ existing = self.pgrest_client.get_grobid(file_meta["sha1hex"])
+ if existing:
+ # grobid_timestamp = existing.get("grobid_timestamp") or None
+ # status
+ grobid_version = existing.get("grobid_version") or None
+ if grobid_version and grobid_version.startswith("0.7"):
+ print("found existing GROBID result", file=sys.stderr)
+ return existing
+
+        # Need to actually process
+ result = self.grobid_client.process_fulltext(resource.body)
+ if self.grobid_sink:
+ # extra fields for GROBID kafka messages
+ result["file_meta"] = file_meta
+ result["key"] = result["file_meta"]["sha1hex"]
+ self.grobid_sink.push_record(result.copy())
+ if result["status"] == "success":
+ metadata = self.grobid_client.metadata(result)
+ if metadata:
+ result["metadata"] = metadata
+ result["fatcat_release"] = metadata.pop("fatcat_release", None)
+ result["grobid_version"] = metadata.pop("grobid_version", None)
+ result.pop("tei_xml", None)
+ result.pop("file_meta", None)
+ result.pop("key", None)
+ return result
+
+ def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Extracts thumbnail and pdf_meta info from PDF.
+
+        By default checks sandcrawler-db for an existing row first, then decides
+        whether we should re-process.
+
+ TODO: difference between Kafka schema and SQL/postgrest schema
+ """
+ if self.try_existing_pdfextract:
+ existing = self.pgrest_client.get_pdf_meta(file_meta["sha1hex"])
+ if existing:
+ print("found existing pdf_meta result", file=sys.stderr)
+ result = PdfExtractResult.from_pdf_meta_dict(existing)
+ return result.to_pdftext_dict()
+
+        # Need to actually process
+ result = process_pdf(resource.body)
+ assert result.sha1hex == file_meta["sha1hex"]
+ assert result.file_meta is not None
+ assert result.file_meta["sha1hex"] == file_meta["sha1hex"]
+ if self.thumbnail_sink and result.page0_thumbnail is not None:
+ self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
+ if self.pdftext_sink:
+ self.pdftext_sink.push_record(result.to_pdftext_dict(), key=result.sha1hex)
+ result.page0_thumbnail = None
+ result.text = None
+ result.file_meta = None
+ return result.to_pdftext_dict()
+
+ def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+        Simply publishes to a Kafka topic.
+
+        In the future, could extract other metadata here (like body word
+        count), or attempt to fetch sub-resources.
+ """
+ if self.xmldoc_sink and file_meta["mimetype"] == "application/jats+xml":
+ try:
+ jats_xml = xml_reserialize(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="xml-parse-error")
+ msg = dict(
+ sha1hex=file_meta["sha1hex"],
+ status="success",
+ jats_xml=jats_xml,
+ )
+ self.xmldoc_sink.push_record(msg, key=file_meta["sha1hex"])
+ return dict(status="success")
+
+ def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
+
+ assert resource.body
+ try:
+ html_doc = HTMLParser(resource.body)
+ except ValueError:
+ return dict(status="html-selectolax-error")
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ assert html_biblio
+ try:
+ html_body = html_extract_body_teixml(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="html-teixml-error")
+ html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
+ html_scope = html_guess_scope(
+ resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")
+ )
+ html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
+
+ if html_scope in ("blocked-captcha", "blocked-cookie", "blocked-forbidden"):
+ return dict(
+ status=html_scope,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ )
+ elif html_scope not in (
+ "article-fulltext",
+ "unknown",
+ ):
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="wrong-scope",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ raw_resources = html_extract_resources(
+ resource.terminal_url, html_doc, self.adblock_rules
+ )
+ if len(raw_resources) > self.max_html_resources:
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="too-many-resources",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ if self.htmlteixml_sink and html_body["status"] == "success":
+ self.htmlteixml_sink.push_record(html_body, key=file_meta["sha1hex"])
+
+ html_body.pop("tei_xml", None)
+
+ partial_result = dict(
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ when = parse_cdx_datetime(resource.cdx.datetime)
+ full_resources: List[WebResource] = []
+
+ try:
+ if self.html_quick_mode:
+ print(" WARN: running quick CDX-only fetches", file=sys.stderr)
+ full_resources = quick_fetch_html_resources(
+ raw_resources, self.wayback_client.cdx_client, when
+ )
+ else:
+ full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+ except PetaboxError as e:
+ partial_result["status"] = "petabox-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except CdxApiError as e:
+ partial_result["status"] = "cdx-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackError as e:
+ partial_result["status"] = "wayback-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackContentError as e:
+ partial_result["status"] = "wayback-content-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except NoCaptureError as e:
+ partial_result["status"] = "html-resource-no-capture"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+
+ info = dict(
+ html_body=html_body,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
+ )
+ if html_scope == "unknown":
+ info["status"] = "unknown-scope"
+ return info
+
+ def timeout_response(self, task: dict) -> dict:
+ print("[TIMEOUT]", file=sys.stderr)
+ return dict(
+ request=task,
+ hit=False,
+ status="timeout",
+ error_message="ingest worker internal timeout",
+ )
+
+ def want(self, request: dict) -> bool:
+ if not request.get("ingest_type") in ("file", "pdf", "xml", "html", "src", "component"):
+ return False
+ return True
+
+ def process(self, request: dict, key: Any = None) -> dict:
+ return self.process_file(request, key=key)
+
+ def process_file(self, request: dict, key: Any = None) -> dict:
+
+ # old backwards compatibility
+ if request.get("ingest_type") == "file":
+ request["ingest_type"] = "pdf"
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("pdf", "xml", "html", "src", "component"):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ for block in self.base_url_blocklist:
+ if block in base_url:
+ print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+ return dict(request=request, hit=False, status="skip-url-blocklist")
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ best_mimetype = None
+ if ingest_type == "pdf":
+ best_mimetype = "application/pdf"
+ elif ingest_type == "xml":
+ best_mimetype = "text/xml"
+ elif ingest_type == "html":
+ best_mimetype = "text/html"
+ elif ingest_type == "src":
+ best_mimetype = "application/gzip"
+
+ existing = self.check_existing_ingest(ingest_type, base_url)
+ if existing:
+ return self.process_existing(request, existing)
+
+ result: Dict[str, Any] = dict(request=request, hit=False)
+
+ next_url = base_url
+ hops = [base_url]
+
+ while len(hops) <= self.max_hops:
+
+ result["hops"] = hops
+
+ # check against blocklist again on each hop
+ for block in self.base_url_blocklist:
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ # also check against known loginwall patterns
+ for block in self.wall_blocklist:
+ if block in next_url:
+ # TODO: blocked-wall instead of skip-wall
+ result["status"] = "skip-wall"
+ return result
+
+ # check for popular cookie blocking URL patterns. On successful SPN
+ # crawls, shouldn't see these redirect URLs
+ for pattern in self.cookie_blocklist:
+ if pattern in next_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ try:
+ resource = self.find_resource(
+ next_url, best_mimetype, force_recrawl=force_recrawl
+ )
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ assert resource
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ if ingest_type == "pdf" and html_ish_resource:
+
+ # the new style of URL extraction (already computed)
+ if html_biblio and html_biblio.pdf_fulltext_url:
+ fulltext_url = dict(
+ pdf_url=html_biblio.pdf_fulltext_url,
+ technique="html_biblio",
+ )
+ else:
+ fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
+
+ result["extract_next_hop"] = fulltext_url
+ if not fulltext_url:
+ # check if we hit a paywall/loginwall
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+ # else, just failed to find link
+ result["status"] = "no-pdf-link"
+ return result
+ next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
+ assert next_url
+ next_url = clean_url(next_url)
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ fulltext_url.get("technique"),
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+ elif (
+ ingest_type in ("xml", "html", "component")
+ and html_ish_resource
+ and html_biblio
+ ):
+ # NOTE: src_fulltext_url is not a thing
+ next_url_found = None
+ if ingest_type == "xml" and html_biblio.xml_fulltext_url:
+ next_url_found = html_biblio.xml_fulltext_url
+ elif ingest_type == "html" and html_biblio.html_fulltext_url:
+ next_url_found = html_biblio.html_fulltext_url
+ elif ingest_type == "component" and html_biblio.component_url:
+ next_url_found = html_biblio.component_url
+
+ if next_url_found:
+ next_url = next_url_found
+ technique = "html_biblio"
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ technique,
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ if ingest_type == "html":
+ # for HTML ingest, we don't count this as a link-loop
+ break
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+
+ # default is to NOT keep hopping
+ break
+
+ if len(hops) >= self.max_hops:
+ result["status"] = "max-hops-exceeded"
+ return result
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ # check if we hit a paywall/loginwall before trying mimetype
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "src":
+ if file_meta["mimetype"] not in self.src_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "component":
+ if file_meta["mimetype"] not in self.component_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ raise NotImplementedError()
+
+ info = self.process_file_hit(ingest_type, resource, file_meta)
+ result.update(info)
+
+ # check if processing turned up an error
+ if info.get("status") not in ("success", None):
+ result["status"] = info["status"]
+ return result
+
+ result["status"] = "success"
+ result["hit"] = True
+ if ingest_type == "pdf":
+ print(
+ "[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ result.get("grobid", {}).get("status_code"),
+ result.get("pdf_meta", {}).get("status"),
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[SUCCESS {:>5}] sha1:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ ),
+ file=sys.stderr,
+ )
+ return result
+
+
+class IngestFileRequestHandler(BaseHTTPRequestHandler):
+ def do_POST(self) -> None:
+ if self.path != "/ingest":
+ self.send_response(404)
+ self.end_headers()
+ self.wfile.write(b"404: Not Found")
+ return
+ length = int(self.headers.get("content-length"))
+ request = json.loads(self.rfile.read(length).decode("utf-8"))
+ print("Got request: {}".format(request))
+ ingester = IngestFileWorker()
+ result = ingester.process(request)
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(json.dumps(result).encode("utf8"))
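+
+
+# Illustrative sketch (this module only defines the handler; serving it and the
+# port shown are assumptions, not part of the code above):
+#
+#   from http.server import HTTPServer
+#   HTTPServer(("localhost", 8083), IngestFileRequestHandler).serve_forever()
+#
+# and then, e.g.:
+#
+#   curl -X POST localhost:8083/ingest -d '{"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"}'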
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
new file mode 100644
index 0000000..3acbece
--- /dev/null
+++ b/python/sandcrawler/ingest_fileset.py
@@ -0,0 +1,516 @@
+import json
+import sys
+import time
+from typing import Any, Dict, Optional
+
+import requests
+from selectolax.parser import HTMLParser
+
+from sandcrawler.fileset_platforms import (
+ ArchiveOrgHelper,
+ DataverseHelper,
+ FigshareHelper,
+ ZenodoHelper,
+)
+from sandcrawler.fileset_strategies import (
+ ArchiveorgFilesetStrategy,
+ ArchiveorgFileStrategy,
+ WebFilesetStrategy,
+ WebFileStrategy,
+)
+from sandcrawler.fileset_types import (
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import html_extract_biblio
+from sandcrawler.ia import (
+ CdxApiError,
+ PetaboxError,
+ SavePageNowError,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_file import IngestFileWorker
+from sandcrawler.misc import clean_url, gen_file_metadata
+from sandcrawler.workers import SandcrawlerWorker
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFilesetWorker(IngestFileWorker):
+ """
+ General process is:
+
+ 1. crawl base_url, and use request and landing page resource (eg, HTML) to
+ determine platform being targeted
+ 2. use platform-specific helper to fetch metadata about the work, including
+ a manifest of files, and selection of an "ingest strategy" and any
+ required context
+ 3. then use strategy-specific helper to archive files from manifest (first
+ checking to see if content has been archived already)
+ 4. summarize status
+ """
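+
+    # Illustrative sketch (same backend assumptions as IngestFileWorker; the
+    # record URL is a made-up placeholder):
+    #
+    #   worker = IngestFilesetWorker(try_spn2=False)
+    #   request = {"ingest_type": "dataset", "base_url": "https://zenodo.org/record/123456"}
+    #   result = worker.process(request)
+    #   print(result["status"], result.get("ingest_strategy"))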
+
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__(sink=None, **kwargs)
+
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.sink = sink
+ self.dataset_platform_helpers = {
+ "dataverse": DataverseHelper(),
+ "figshare": FigshareHelper(),
+ "zenodo": ZenodoHelper(),
+ "archiveorg": ArchiveOrgHelper(),
+ }
+ self.dataset_strategy_archivers = {
+ IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+ IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+ IngestStrategy.WebFileset: WebFilesetStrategy(try_spn2=self.try_spn2),
+ IngestStrategy.WebFile: WebFileStrategy(try_spn2=self.try_spn2),
+ }
+
+ self.max_total_size = kwargs.get("max_total_size", 64 * 1024 * 1024 * 1024)
+ self.max_file_count = kwargs.get("max_file_count", 200)
+ self.ingest_file_result_sink = kwargs.get("ingest_file_result_sink")
+ self.ingest_file_result_stdout = kwargs.get("ingest_file_result_stdout", False)
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Same as file version, but uses fileset result table
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_fileset_platform(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest fileset result, do any database fetches
+ or additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+
+ def want(self, request: dict) -> bool:
+ if not request.get("ingest_type") in ("dataset",):
+ return False
+ return True
+
+ def fetch_resource_iteratively(
+ self, ingest_type: str, base_url: str, force_recrawl: bool
+ ) -> dict:
+ """
+ This is copypasta from process_file(), should probably refactor.
+ """
+
+ result: Dict[str, Any] = dict(hit=False)
+ result["hops"] = [base_url]
+ next_url = base_url
+
+ # check against blocklist
+ for block in self.base_url_blocklist:
+ # NOTE: hack to not skip archive.org content
+ if "archive.org" in block:
+ continue
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ try:
+ resource = self.find_resource(next_url, force_recrawl=force_recrawl)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ html_biblio = None
+ if resource:
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ # eg, datasets, components, etc
+ pass
+
+ result["_html_biblio"] = html_biblio
+ result["_resource"] = resource
+ return result
+
+ def process(self, request: dict, key: Any = None) -> dict:
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("dataset",):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ # TODO: "existing" check against file and/or fileset ingest result table
+ # existing = self.check_existing_ingest(ingest_type, base_url)
+ # if existing:
+ # return self.process_existing(request, existing)
+
+ result = self.fetch_resource_iteratively(
+ ingest_type, base_url, force_recrawl=force_recrawl
+ )
+ result["request"] = request
+ if result.get("status") is not None:
+ result["request"] = request
+ return result
+
+ html_biblio = result.pop("_html_biblio")
+ resource = result.pop("_resource")
+
+ # 1. Determine `platform`, which may involve resolving redirects and crawling a landing page.
+
+ # TODO: could involve html_guess_platform() here?
+
+ # determine platform
+ platform_helper = None
+ for (helper_name, helper) in self.dataset_platform_helpers.items():
+ if helper.match_request(request, resource, html_biblio):
+ platform_helper = helper
+ break
+
+ if not platform_helper:
+ result["status"] = "no-platform-match"
+ return result
+
+ # 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
+ try:
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ except PlatformScopeError as e:
+ result["status"] = "platform-scope"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PlatformRestrictedError as e:
+ result["status"] = "platform-restricted"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except requests.exceptions.HTTPError as e:
+ result["error_message"] = str(e)[:1600]
+ if e.response.status_code == 404:
+ result["status"] = "platform-404"
+ result["error_message"] = str(e)[:1600]
+ return result
+ else:
+ result["status"] = "platform-http-error"
+ return result
+ except requests.exceptions.RequestException as e:
+ result["error_message"] = str(e)[:1600]
+ result["status"] = "platform-error"
+ return result
+
+ # print(dataset_meta, file=sys.stderr)
+ platform = dataset_meta.platform_name
+ result["platform_name"] = dataset_meta.platform_name
+ result["platform_domain"] = dataset_meta.platform_domain
+ result["platform_id"] = dataset_meta.platform_id
+ result["platform_base_url"] = dataset_meta.web_base_url
+ result["archiveorg_item_name"] = dataset_meta.archiveorg_item_name
+
+ if not dataset_meta.manifest:
+ result["status"] = "empty-manifest"
+ return result
+
+ # these will get confirmed/updated after ingest
+ result["manifest"] = [m.dict(exclude_none=True) for m in dataset_meta.manifest]
+ result["file_count"] = len(dataset_meta.manifest)
+ result["total_size"] = sum([m.size for m in dataset_meta.manifest if m.size])
+
+ if result["total_size"] > self.max_total_size:
+ result["status"] = "too-large-size"
+ return result
+ if result["file_count"] > self.max_file_count:
+ # hard max, to prevent downstream breakage
+ if result["file_count"] > 10 * 1000:
+ result["manifest"] = result["manifest"][: self.max_file_count]
+ result["status"] = "too-many-files"
+ return result
+
+ ingest_strategy = platform_helper.chose_strategy(dataset_meta)
+ result["ingest_strategy"] = ingest_strategy
+ print(
+ f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}",
+ file=sys.stderr,
+ )
+
+ strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
+ if not strategy_helper:
+ result["status"] = "no-strategy-helper"
+ return result
+
+ # 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
+ try:
+ archive_result = strategy_helper.process(dataset_meta)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ # 4. Summarize status and return structured result metadata.
+ result["status"] = archive_result.status
+ result["manifest"] = [m.dict(exclude_none=True) for m in archive_result.manifest]
+
+ if ingest_strategy.endswith("-fileset-bundle"):
+ result["fileset_bundle"] = dict()
+ if archive_result.bundle_file_meta:
+ result["fileset_bundle"]["file_meta"] = archive_result.bundle_file_meta
+ if archive_result.bundle_archiveorg_path:
+ result["fileset_bundle"][
+ "archiveorg_bundle_path"
+ ] = archive_result.bundle_archiveorg_path
+ if archive_result.bundle_resource:
+ result["fileset_bundle"]["terminal"] = dict(
+ terminal_url=archive_result.bundle_resource.terminal_url,
+ terminal_dt=archive_result.bundle_resource.terminal_dt,
+ terminal_status_code=archive_result.bundle_resource.terminal_status_code,
+ )
+ if archive_result.bundle_resource.cdx:
+ result["fileset_bundle"]["cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.cdx
+ )
+ if archive_result.bundle_resource.revisit_cdx:
+ result["fileset_bundle"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.revisit_cdx
+ )
+
+ if ingest_strategy.endswith("-file"):
+ result["fileset_file"] = dict()
+ if archive_result.file_file_meta:
+                result["fileset_file"]["file_meta"] = archive_result.file_file_meta
+ if archive_result.file_resource:
+ result["fileset_file"]["terminal"] = dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ )
+ if archive_result.file_resource.cdx:
+ result["fileset_file"]["cdx"] = cdx_to_dict(
+ archive_result.file_resource.cdx
+ )
+ if archive_result.file_resource.revisit_cdx:
+ result["fileset_file"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+
+ if result["status"].startswith("success"):
+ # check that these are still valid
+ assert result["file_count"] == len(archive_result.manifest)
+ assert result["total_size"] == sum(
+ [m.size for m in archive_result.manifest if m.size]
+ )
+
+ if (
+ result["status"] == "success-file"
+ and archive_result.file_resource
+ and archive_result.file_file_meta
+ ):
+ file_result: Dict[str, Any] = dict(
+ hit=True,
+ status="success",
+ request=request.copy(),
+ file_meta=archive_result.file_file_meta,
+ terminal=dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ terminal_sha1hex=archive_result.file_file_meta["sha1hex"],
+ ),
+ )
+ if archive_result.file_resource.cdx:
+ file_result["cdx"] = cdx_to_dict(archive_result.file_resource.cdx)
+ if archive_result.file_resource.revisit_cdx:
+ file_result["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+ file_result["request"]["ingest_type"] = request["ingest_type"] + "-file"
+            # call the super() (ingest_file) version of process_file_hit()
+ info = self.process_file_hit(
+ file_result["request"]["ingest_type"],
+ archive_result.file_resource,
+ archive_result.file_file_meta,
+ )
+ file_result.update(info)
+ if self.ingest_file_result_sink:
+ self.ingest_file_result_sink.push_record(result.copy())
+ elif self.ingest_file_result_stdout:
+ sys.stdout.write(json.dumps(file_result, sort_keys=True) + "\n")
+
+ if result["status"].startswith("success"):
+ result["hit"] = True
+ print(
+ "[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["status"],
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ return result
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/ingest_html.py
index 91b9cd6..fb42e71 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,27 +1,43 @@
-
-import io
-import sys
-import json
-import datetime
import argparse
+import datetime
+import json
+import sys
import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
-import trafilatura
import pydantic
+import trafilatura
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import (
+ BiblioMetadata,
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiClient,
+ NoCaptureError,
+ WaybackClient,
+ WaybackContentError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.misc import (
+ datetime_to_cdx,
+ gen_file_metadata,
+ parse_cdx_datetime,
+ url_fuzzy_equal,
+)
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
+
def html_extract_body_teixml(doc: bytes) -> dict:
try:
- tei_xml = trafilatura.extract(doc,
- tei_output=True,
+ tei_xml = trafilatura.extract(
+ doc,
+ output_format="xmltei",
include_comments=False,
include_formatting=True,
)
@@ -33,22 +49,28 @@ def html_extract_body_teixml(doc: bytes) -> dict:
if tei_xml:
body_txt = teixml_body_text(tei_xml)
word_count = len(body_txt.split())
- return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count)
- elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'):
+ return dict(
+ status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count
+ )
+ elif doc.startswith(
+ b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'
+ ):
# hack for firstmonday.org
return html_extract_body_teixml(doc[106:])
else:
return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
+
def teixml_body_text(doc_xml: str) -> str:
ns = {"tei": "http://www.tei-c.org/ns/1.0"}
tree = ET.fromstring(doc_xml)
- body = tree.find('.//tei:body', ns)
+ body = tree.find(".//tei:body", ns)
if body:
return " ".join(body.itertext())
else:
return ""
+
class WebResource(pydantic.BaseModel):
surt: str
timestamp: datetime.datetime
@@ -61,16 +83,15 @@ class WebResource(pydantic.BaseModel):
resource_type: Optional[str]
class Config:
- json_encoders = {
- datetime.datetime: lambda dt: dt.isoformat()
- }
+ json_encoders = {datetime.datetime: lambda dt: dt.isoformat()}
+
class IngestWebResult(pydantic.BaseModel):
status: str
hit: bool
error_message: Optional[str]
cdx: Optional[dict]
- terminal: Optional[Any] # TODO
+ terminal: Optional[Any] # TODO
request: Optional[Any] # TODO
file_meta: Optional[dict]
html_biblio: Optional[BiblioMetadata]
@@ -84,6 +105,7 @@ class IngestWebResult(pydantic.BaseModel):
datetime.datetime: lambda dt: dt.isoformat(),
}
+
class HtmlMetaRow(pydantic.BaseModel):
sha1hex: str
status: str
@@ -106,7 +128,7 @@ class HtmlMetaRow(pydantic.BaseModel):
"""
return (
self.sha1hex,
- datetime.datetime.now(), # updated
+ datetime.datetime.now(), # updated
self.status,
self.scope,
self.has_teixml,
@@ -117,7 +139,9 @@ class HtmlMetaRow(pydantic.BaseModel):
)
-def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+def quick_fetch_html_resources(
+ resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
"""
This is the lazy version that just does a CDX lookup for each resource.
@@ -128,31 +152,37 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
full = []
closest = when and datetime_to_cdx(when)
for resource in resources:
- cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
+ cdx_row = cdx_client.lookup_best(resource["url"], closest=closest)
if not cdx_row:
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
- if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']):
- print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr)
+ if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
+ print(
+ f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
+ )
if not cdx_row.status_code:
# TODO: fall back to a full fetch?
- print(f" WARN: skipping revisit record", file=sys.stderr)
+ print(" WARN: skipping revisit record", file=sys.stderr)
continue
- full.append(WebResource(
- surt=cdx_row.surt,
- timestamp=cdx_row.datetime,
- url=cdx_row.url,
- sha1hex=cdx_row.sha1hex,
- mimetype=cdx_row.mimetype,
- status_code=cdx_row.status_code,
- size=None,
- sha256hex=None,
- resource_type=resource['type'],
- ))
+ full.append(
+ WebResource(
+ surt=cdx_row.surt,
+ timestamp=cdx_row.datetime,
+ url=cdx_row.url,
+ sha1hex=cdx_row.sha1hex,
+ mimetype=cdx_row.mimetype,
+ status_code=cdx_row.status_code,
+ size=None,
+ sha256hex=None,
+ resource_type=resource["type"],
+ )
+ )
return full
-def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+def fetch_html_resources(
+ resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
"""
This is the full version which fetches each resource from wayback/petabox
and calculates additional hashes.
@@ -163,33 +193,50 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
full = []
closest = when and datetime_to_cdx(when)
for resource in resources:
- wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest)
- if not wayback_resp or wayback_resp.status != 'success':
+ wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
+ if not wayback_resp or wayback_resp.status != "success":
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
- file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
- if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
- raise WaybackContentError("wayback payload sha1hex mismatch: {wayback_resp.cdx.url}")
- full.append(WebResource(
- surt=wayback_resp.cdx.surt,
- timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
- url=wayback_resp.cdx.url,
- sha1hex=file_meta['sha1hex'],
- mimetype=file_meta['mimetype'],
- status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code,
- size=file_meta['size_bytes'],
- sha256hex=file_meta['sha256hex'],
- resource_type=resource['type'],
- ))
+        # for HTML sub-resources specifically, we allow the CDX SHA1 to match
+        # either the transfer-encoded or the inner (un-encoded) payload body.
+        # This is because of an ambiguity in the WARC specification.
+ outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
+ try:
+ file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ except Exception as e:
+ raise WaybackContentError(f"bad gzip encoding: {e}")
+ if (
+ file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ ):
+ raise WaybackContentError(
+ f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url} found:{file_meta['sha1hex']} expected:{wayback_resp.cdx.sha1hex}"
+ )
+ full.append(
+ WebResource(
+ surt=wayback_resp.cdx.surt,
+ timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
+ url=wayback_resp.cdx.url,
+ sha1hex=file_meta["sha1hex"],
+ mimetype=file_meta["mimetype"],
+ status_code=wayback_resp.cdx.status_code
+ or wayback_resp.revisit_cdx.status_code,
+ size=file_meta["size_bytes"],
+ sha256hex=file_meta["sha256hex"],
+ resource_type=resource["type"],
+ )
+ )
return full
-def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:
+def html_guess_platform(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
+) -> Optional[str]:
generator: Optional[str] = None
generator_elem = doc.css_first("meta[name='generator']")
if generator_elem:
- generator = generator_elem.attrs['content']
+ generator = generator_elem.attrs["content"]
else:
generator_elem = doc.css_first("a[id='developedBy']")
if generator_elem:
@@ -200,12 +247,21 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
return "ojs"
elif generator and "plone" in generator.lower():
return "plone"
+ elif generator and "wordpress" in generator.lower():
+ return "wordpress"
+ elif generator and "blogger" in generator.lower():
+ return "blogger"
elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
return "ojs"
else:
try:
- if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
+ if (
+ 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
+ in doc.html
+ ):
return "ojs"
+ if '<a href="https://www.pubpub.org">Published with' in doc.html:
+ return "pubpub"
if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
return "arpha"
if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
@@ -214,18 +270,21 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
pass
icon_elem = doc.css_first("link[type='image/x-icon']")
- if icon_elem and 'href' in icon_elem.attrs:
- if 'journalssystem.com' in icon_elem.attrs['href']:
+ if icon_elem and "href" in icon_elem.attrs:
+ if "journalssystem.com" in icon_elem.attrs["href"]:
return "journalssystem.com"
- elif 'indexcopernicus.com' in icon_elem.attrs['href']:
+ elif "indexcopernicus.com" in icon_elem.attrs["href"]:
return "indexcopernicus"
- if 'scielo' in url:
+ if "scielo" in url:
return "scielo"
return None
-def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
+
+def html_guess_scope(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]
+) -> str:
"""
This function tries to guess if an HTML document represents one of:
@@ -236,6 +295,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
- component
- issue-fulltext
- landingpage
+ - homepage-domain
- blocked-paywall
- blocked-login
- blocked-captcha
@@ -249,6 +309,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
fulltext or a landing page, but could be one of the other categories.
"""
+ # assert that this is a real URL
+ assert url.count("/") >= 2
+
# basic paywall and loginwall detection based on URL
if url.endswith("/cookieAbsent"):
return "blocked-cookie"
@@ -264,6 +327,10 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if "showcaptcha.asp" in url:
return "blocked-captcha"
+ # is this the top-level URL of the domain? aka, no path?
+    if url.count("/") <= 2 or (url.count("/") == 3 and url.endswith("/")):
+ return "homepage-domain"
+
platform = html_guess_platform(url, doc, biblio)
if biblio:
@@ -308,13 +375,17 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if word_count is not None:
if word_count < 20:
return "stub"
+ elif word_count > 500 and platform in ["wordpress", "blogger"]:
+ return "article-fulltext"
elif word_count > 1200:
return "article-fulltext"
return "unknown"
-def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
+def run_single(
+ url: str, timestamp: Optional[str] = None, quick_mode: bool = False
+) -> IngestWebResult:
adblock = load_adblock_rules()
wayback_client = WaybackClient()
@@ -332,7 +403,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
file_meta = gen_file_metadata(html_resource.body)
file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
- if file_meta['mimetype'] not in ("text/html", "text/xml"):
+ if file_meta["mimetype"] not in ("text/html", "text/xml"):
return IngestWebResult(
status="wrong-mimetype",
hit=False,
@@ -343,8 +414,8 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
html_doc = HTMLParser(html_resource.body)
html_biblio = html_extract_biblio(url, html_doc)
html_body = html_extract_body_teixml(html_resource.body)
- html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count'))
- if html_scope not in ('article-fulltext', 'unknown'):
+ html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get("word_count"))
+ if html_scope not in ("article-fulltext", "unknown"):
return IngestWebResult(
status="wrong-scope",
hit=False,
@@ -361,7 +432,9 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
full_resources: List[WebResource] = []
if quick_mode:
- full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
+ full_resources = quick_fetch_html_resources(
+ raw_resources, wayback_client.cdx_client, when
+ )
else:
full_resources = fetch_html_resources(raw_resources, wayback_client, when)
@@ -382,12 +455,10 @@ def main() -> None:
"""
Run this command like:
- python -m sandcrawler.html_ingest
+ python -m sandcrawler.ingest_html
"""
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
subparsers = parser.add_subparsers()
sub = subparsers.add_parser(
@@ -419,9 +490,10 @@ def main() -> None:
result = run_single(args.url, args.timestamp, args.quick_mode)
print(result.json(indent=2, exclude_none=True))
else:
- #func = getattr(wp, args.func)
- #func()
+ # func = getattr(wp, args.func)
+ # func()
raise NotImplementedError()
+
if __name__ == "__main__":
main()
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..8836515 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,14 +1,18 @@
-
-import io
-import os
import hashlib
+import io
+from typing import Optional, Tuple, Union
import minio
class SandcrawlerMinioClient(object):
-
- def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+ def __init__(
+ self,
+ host_url: str,
+ access_key: str,
+ secret_key: str,
+ default_bucket: Optional[str] = None,
+ ):
"""
host is minio connection string (host:port)
access and secret key are as expected
@@ -28,7 +32,7 @@ class SandcrawlerMinioClient(object):
)
self.default_bucket = default_bucket
- def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+ def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
if not extension:
extension = ""
if not prefix:
@@ -44,7 +48,15 @@ class SandcrawlerMinioClient(object):
)
return obj_path
- def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+ def put_blob(
+ self,
+ folder: str,
+ blob: Union[str, bytes],
+ sha1hex: Optional[str] = None,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> Tuple[str, str]:
"""
blob should be bytes
sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -53,7 +65,7 @@ class SandcrawlerMinioClient(object):
filename is SHA1 with an optional file extension.
"""
if type(blob) == str:
- blob = blob.encode('utf-8')
+ blob = blob.encode("utf-8")
assert type(blob) == bytes
if not sha1hex:
h = hashlib.sha1()
@@ -64,13 +76,13 @@ class SandcrawlerMinioClient(object):
bucket = self.default_bucket
assert bucket
content_type = "application/octet-stream"
- if extension.endswith('.xml'):
+ if extension.endswith(".xml"):
content_type = "application/xml"
- if extension.endswith('.png'):
+ if extension.endswith(".png"):
content_type = "image/png"
- elif extension.endswith('.jpg') or extension.endswith('.jpeg'):
+ elif extension.endswith(".jpg") or extension.endswith(".jpeg"):
content_type = "image/jpeg"
- elif extension.endswith('.txt'):
+ elif extension.endswith(".txt"):
content_type = "text/plain"
self.mc.put_object(
bucket,
@@ -81,7 +93,14 @@ class SandcrawlerMinioClient(object):
)
return (bucket, obj_path)
- def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+ def get_blob(
+ self,
+ folder: str,
+ sha1hex: str,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> bytes:
"""
sha1hex is sha1 of the blob itself
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index a3e2960..4e37036 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,39 +1,50 @@
-
import base64
-import magic
-import hashlib
import datetime
-from typing import Optional
+import hashlib
+import os
+from typing import List, Optional
+import magic
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def clean_url(s: str) -> str:
s = s.strip()
parsed = urlcanon.parse_url(s)
if not parsed.port and parsed.colon_before_port:
- parsed.colon_before_port = b''
+ parsed.colon_before_port = b""
return str(urlcanon.whatwg(parsed))
+
def url_fuzzy_equal(left: str, right: str) -> bool:
"""
TODO: use proper surt library and canonicalization for this check
"""
- fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
- fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
+ fuzzy_left = "://".join(
+ clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
+ fuzzy_right = "://".join(
+ clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
if fuzzy_left == fuzzy_right:
return True
elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
return True
return False
+
def test_url_fuzzy_equal() -> None:
- assert True == url_fuzzy_equal(
- "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
- "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree")
+ assert (
+ url_fuzzy_equal(
+ "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ )
+ is True
+ )
+
def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
"""
@@ -44,12 +55,15 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
assert blob is not None
if not allow_empty:
assert blob
- mimetype = magic.Magic(mime=True).from_buffer(blob)
+ if len(blob) < 1024 * 1024:
+ mimetype = magic.Magic(mime=True).from_buffer(blob)
+ else:
+ mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
if mimetype in ("application/xml", "text/xml"):
# crude checks for XHTML or JATS XML, using only first 1 kB of file
if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
mimetype = "application/xhtml+xml"
- elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
@@ -66,6 +80,49 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
mimetype=mimetype,
)
+
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+ """
+ Variant of gen_file_metadata() which works with files on local disk
+ """
+ assert path is not None
+ mimetype = magic.Magic(mime=True).from_file(path)
+ if mimetype in ("application/xml", "text/xml"):
+ with open(path, "rb") as f:
+ blob = f.read(1024)
+ # crude checks for XHTML or JATS XML, using only first 1 kB of file
+ if (
+ b"<htm" in blob[:1024]
+ and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]
+ ):
+ mimetype = "application/xhtml+xml"
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
+ mimetype = "application/jats+xml"
+ hashes = [
+ hashlib.sha1(),
+ hashlib.sha256(),
+ hashlib.md5(),
+ ]
+ size_bytes = 0
+ with open(path, "rb") as f:
+ while True:
+ chunk = f.read(1024 * 1024)
+ if not chunk:
+ break
+ size_bytes += len(chunk)
+ for h in hashes:
+ h.update(chunk)
+ if not allow_empty:
+ assert size_bytes > 0
+ return dict(
+ size_bytes=size_bytes,
+ sha1hex=hashes[0].hexdigest(),
+ sha256hex=hashes[1].hexdigest(),
+ md5hex=hashes[2].hexdigest(),
+ mimetype=mimetype,
+ )
+
+
def b32_hex(s: str) -> str:
"""
Converts a base32-encoded SHA-1 checksum into hex-encoded
@@ -79,16 +136,18 @@ def b32_hex(s: str) -> str:
if len(s) == 40:
return s
raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
NORMAL_MIME = (
- 'application/pdf',
- 'application/postscript',
- 'text/html',
- 'text/xml',
- 'application/octet-stream',
+ "application/pdf",
+ "application/postscript",
+ "text/html",
+ "text/xml",
+ "application/octet-stream",
)
+
def normalize_mime(raw: str) -> Optional[str]:
raw = raw.lower().strip()
for norm in NORMAL_MIME:
@@ -96,28 +155,26 @@ def normalize_mime(raw: str) -> Optional[str]:
return norm
# Special cases
- if raw.startswith('application/xml'):
- return 'text/xml'
- if raw.startswith('application/x-pdf'):
- return 'application/pdf'
+ if raw.startswith("application/xml"):
+ return "text/xml"
+ if raw.startswith("application/x-pdf"):
+ return "application/pdf"
+ if raw in (".pdf",):
+ return "application/pdf"
if raw in (
- '.pdf',
- ):
- return 'application/pdf'
- if raw in (
- 'application/download',
- 'binary/octet-stream',
- 'unk',
- 'application/x-download',
- 'application/octetstream',
- 'application/force-download',
- 'application/unknown',
- ):
- return 'application/octet-stream'
+ "application/download",
+ "binary/octet-stream",
+ "unk",
+ "application/x-download",
+ "application/octetstream",
+ "application/force-download",
+ "application/unknown",
+ ):
+ return "application/octet-stream"
return None
-def test_normalize_mime():
+def test_normalize_mime() -> None:
assert normalize_mime("asdf") is None
assert normalize_mime("application/pdf") == "application/pdf"
assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -130,7 +187,7 @@ def test_normalize_mime():
assert normalize_mime("binary/octet-stream") == "application/octet-stream"
-def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
"""
This method always filters a few things out:
@@ -151,14 +208,19 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
offset = cdx[9]
warc = cdx[10]
- if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
- and len(sha1b32) == 32 and dt.isdigit()):
+ if not (
+ sha1b32.isalnum()
+ and c_size.isdigit()
+ and offset.isdigit()
+ and len(sha1b32) == 32
+ and dt.isdigit()
+ ):
return None
- if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+ if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
return None
- if mime is None or mime == '-':
+ if mime is None or mime == "-":
mime = "application/octet-stream"
if normalize:
@@ -179,6 +241,7 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
warc_path=warc,
)
+
def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
if not dt_str:
return None
@@ -187,23 +250,39 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
except Exception:
return None
+
def test_parse_cdx_datetime() -> None:
- assert parse_cdx_datetime("") == None
- assert parse_cdx_datetime("asdf") == None
- assert parse_cdx_datetime("19930203123045") != None
- assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+ assert parse_cdx_datetime("") is None
+ assert parse_cdx_datetime("asdf") is None
+ assert parse_cdx_datetime("19930203123045") is not None
+ assert parse_cdx_datetime("20201028235103") == datetime.datetime(
+ year=2020, month=10, day=28, hour=23, minute=51, second=3
+ )
+
def datetime_to_cdx(dt: datetime.datetime) -> str:
- return '%04d%02d%02d%02d%02d%02d' % (
- dt.year, dt.month, dt.day,
- dt.hour, dt.minute, dt.second,
+ return "%04d%02d%02d%02d%02d%02d" % (
+ dt.year,
+ dt.month,
+ dt.day,
+ dt.hour,
+ dt.minute,
+ dt.second,
)
+
def test_datetime_to_cdx() -> None:
- assert "20201028235103" == datetime_to_cdx(datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
+ assert "20201028235103" == datetime_to_cdx(
+ datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+ )
-def requests_retry_session(retries=10, backoff_factor=3,
- status_forcelist=(500, 502, 504), session=None) -> requests.Session:
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 1,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: Optional[requests.Session] = None,
+) -> requests.Session:
"""
From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
"""
@@ -216,7 +295,23 @@ def requests_retry_session(retries=10, backoff_factor=3,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
- session.mount('http://', adapter)
- session.mount('https://', adapter)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
return session
+
+def sanitize_fs_path(path: str) -> str:
+ """
+ From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+ """
+ # - pretending to chroot to the current directory
+ # - cancelling all redundant paths (/.. = /)
+ # - making the path relative
+ return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/")
+
+
+def test_sanitize_fs_path() -> None:
+ assert sanitize_fs_path("/thing.png") == "thing.png"
+ assert sanitize_fs_path("../../thing.png") == "thing.png"
+ assert sanitize_fs_path("thing.png") == "thing.png"
+ assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 311bbf8..97d338e 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,22 +1,21 @@
-
-import sys
-import json
import datetime
-from io import BytesIO
+import json
+import sys
from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
import poppler
from PIL import Image
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .ia import WaybackClient
from .misc import gen_file_metadata
-
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
# This is a hack to work around timeouts when processing certain PDFs with
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
# these, maybe due to threading.
-BAD_PDF_SHA1HEX = [
+BAD_PDF_SHA1HEX: List[str] = [
"011478a1e63a2a31eae1a93832a74cc95f220760",
"018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
"057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
@@ -57,6 +56,7 @@ BAD_PDF_SHA1HEX = [
"43a8c0abf0386d3e3397cf5e22a884761dd63db7",
"445968ef735b228c08c3ff4238d99fc9f4824619",
"447fa6b5a90742a86429a932f6608d8e141688c0",
+ "45f014d7d631559dc7726e5c5513f1e7c91c48a9",
"47577ff6d6876117ca69bec60a5764f7d2c2ec70",
"4785181cec8944eee00ddb631a5dfc771b89bab7",
"47db2db2cc976429568841a0496c0ab4ed7b5977",
@@ -65,37 +65,49 @@ BAD_PDF_SHA1HEX = [
"4edc1402712fa6827c4501fed8042e9f4447829c",
"50b3c5a3122272aca69855ef06b85d0b43a76eb1",
"52fc9b3c5199ef395d410c7cee5961dc812e4d29",
+ "53471346019947a88c1ba141fb829375527153b0",
"58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
"59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0",
"5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
+ "5c5b45c85eff07d4302844e00ec8baa57b988c60",
"5e04779cbbae5ce88bb786064f756885dd6895fe",
"5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "62247fe6b8d3ca50477cafddbe24bf63832d6674",
"623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
"646c4a654270606256397684204ff0f3d17be2e7",
"64d821d728f9a3dc944b4c03be00feea0b57e314",
+ "668b7d777203af4b261d21bf4669fc9b385062e1",
"689b5cb3ddef213d612363a903f10d0358ea64d2",
"6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
"74e617dc95555e8ca3aadd19d0c85b71cd77d1d9",
+ "7596438d77444a7c4228bb96fa4b394ba7d7e23b",
"75c2662a96ccc48891228df7c85eb7d4da9dd621",
"771f1ca0007a6fbed5b4a434c73f524f715d33c1",
"776859635e9dc01d97b0582f49c814ffbcb019fb",
"781dafda896a9f5c30f3d0a011f79a3b79b574c4",
"788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
"79d6cba3c6e577a0f3a3a9fe575680d38454938d",
+ "7b8b7e8e4b789579a7d2fda329db52528383a652",
+ "7c5c925cfb7c5a861b5c0a1d923308f9bedd335e",
+ "7cfc0739be9c49d94272110a0a748256bdde9be6",
"7daf61526ec825151f384cc1db510ca5237d5d80",
"7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
+ "800e47a7ed214f7acac85cc29aa7b0f9c0e218ae",
"8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67",
"859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad",
"88edcbab1cac2d70af5870422974afc253f4f0c6",
"89860fc475fcb2a2d86c4544df52ec8fd5e6533f",
"8dcaf4ef132900dd378f7be526c884b17452713b",
"8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
+ "8ec1a17ec19ae8ade95b9bdc837236981e83fffb",
"949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
"961ec451172f373f919c593737466300e42062cb",
"976989fa6e447578d9ce16ec5b526f0e09d6df50",
+ "977f23723027d7052df9b49eb467e6c0b9af93ff",
"98b02eb70066c182c705ef4d14d8b723ad7f1fab",
"993ca31f6974f8387bb18dd7d38987d290da8781",
"9dbd05af3442e6f42d67868054751b76973f4171",
+ "a1cc781c694a48e018f4de110b58f561aa212051",
"a2298c137b9c8c8975bad62eea9224edb95e6952",
"a2671738755ab8b24775e95375dc72f1ca4e5fd6",
"a26f299fb97c646effeebd4c5e2968786bd0f781",
@@ -104,7 +116,9 @@ BAD_PDF_SHA1HEX = [
"a69665d0b5d3b95f54f68406eee3ed50c67efb45",
"a8357c31837404f9ebd798999d546c9398ab3648",
"a9162b9aef5e5da0897275fede1a6cff8cc93dfc",
+ "abc9d264df446707b40d7c9f79befd0f89291e59",
"ad038725bf6855a79f3c768ebe93c7103d14522f",
+ "aef581bf42e76e527f5aed3b8958fd4e7a24819f",
"b2b66b9c7f817a20144456f99c0be805602e8597",
"b2d719120306b90eb8dd3580b699a61ec70556f4",
"b4b8e18e27f102e59b2be2d58c7b54d0a0eb457a",
@@ -113,9 +127,11 @@ BAD_PDF_SHA1HEX = [
"b8b427e5b3d650ba9e03197f9c3917e25b878930",
"bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
"be0cda7642e9247b3ee41cd2017fa709aab4f344",
+ "beff1b0c24aa99989be73c66dfb1d1e7578e370b",
"c1b583fbd052572f08158d39ffe4d7510dadbebb",
"c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1",
"c4abbb284f4acaca9e8ceb88f842901984e84d33",
+ "c58e028269c8dfd3a442f6745c81b4c0e8610c43",
"c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
"c7687fa6f637c7d32a25be0e772867d87536d35c",
"c7d8b37ec99cf0d987e60667f05299f200e18a5d",
@@ -126,101 +142,117 @@ BAD_PDF_SHA1HEX = [
"d055c054c330f99ec011e37186d2b429339758fd",
"d17b1e254cce82df5c6eb4fd492cef91e7e11558",
"d188762a7e3ab5d4ee8a897204316513e4e636ec",
+ "d613b9e4442f5d5d19ea6814fa9729bff7da7c85",
"d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "d91d3830bf455e6dd782eee46218e35d29f07dfd",
"da2211ee2dbc6dda36571976d810e2366a3d2504",
+ "dbb3093a797e0ae83d39eb7b235ff85a17fd965c",
"e01bb7256d77aea258313bb410dfcfc10512f420",
"e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
"e2c3b8a2cf33d5e8972bc9ddb78373766a75e412",
"e64714a81f60ab9286ec90cad682cb22e564fb6f",
"e9d7716b4f94bbc3d94459b5fe9bb8b15cb2e433",
+ "e9e84e17383e93a784a8471708619162b32fb399",
"eac7df5f799983d5a7cc55d10b4d426dc557febf",
+ "eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36",
"eb1b39fd7a874896688855a22efddef10272427c",
"eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "ecc4b927c5e84e145c610876931bc261ae13769b",
"edf8dcc8736f06afbaca0e01d60bd2c475403a3d",
+ "ee2ee6ae2cf05128810d0d95bbe69bd263e140de",
"ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
+ "ef1dfa325c21cff4cd8bb1a9b6c4ee6996d43c8f",
"ef6749d9263a01f921ba7d72df0d17671d14e5f6",
"f0ea221d8587cede25592266486e119d277f7096",
"f68f9a9202a75d2aee35252e104d796f9515001e",
"f9314d3bf2eac78a7d78d18adcccdb35542054ef",
+ "f932ef936021a3b00842b481478c40868b9a007c",
"fd9bd560662e070b222d63052830837829c490f0",
]
+
@dataclass
class PdfExtractResult:
sha1hex: str
status: str
error_msg: Optional[str] = None
- file_meta: Optional[Dict[str,Any]] = None
+ file_meta: Optional[Dict[str, Any]] = None
text: Optional[str] = None
page0_thumbnail: Optional[bytes] = None
has_page0_thumbnail: bool = False
meta_xml: Optional[str] = None
- pdf_info: Optional[Dict[str,Any]] = None
- pdf_extra: Optional[Dict[str,Any]] = None
- source: Optional[Dict[str,Any]] = None
+ pdf_info: Optional[Dict[str, Any]] = None
+ pdf_extra: Optional[Dict[str, Any]] = None
+ source: Optional[Dict[str, Any]] = None
def to_pdftext_dict(self) -> dict:
"""
Outputs a JSON string as would be published to Kafka text/info topic.
"""
return {
- 'key': self.sha1hex,
- 'sha1hex': self.sha1hex,
- 'status': self.status,
- 'file_meta': self.file_meta,
- 'error_msg': self.error_msg,
- 'text': self.text,
- 'has_page0_thumbnail': self.has_page0_thumbnail,
- 'meta_xml': self.meta_xml,
- 'pdf_info': self.pdf_info,
- 'pdf_extra': self.pdf_extra,
- 'source': self.source,
+ "key": self.sha1hex,
+ "sha1hex": self.sha1hex,
+ "status": self.status,
+ "file_meta": self.file_meta,
+ "error_msg": self.error_msg,
+ "text": self.text,
+ "has_page0_thumbnail": self.has_page0_thumbnail,
+ "meta_xml": self.meta_xml,
+ "pdf_info": self.pdf_info,
+ "pdf_extra": self.pdf_extra,
+ "source": self.source,
}
- @classmethod
- def from_pdftext_dict(cls, record):
+ @staticmethod
+ def from_pdftext_dict(record: Dict[str, Any]) -> "PdfExtractResult":
"""
         Parses a dict, as would be consumed from the Kafka text/info topic.
"""
- if record['status'] != 'success':
+ if record["status"] != "success":
return PdfExtractResult(
- sha1hex=record.get('sha1hex') or record['key'],
- status=record['status'],
- error_msg=record.get('error_msg'),
+ sha1hex=record.get("sha1hex") or record["key"],
+ status=record["status"],
+ error_msg=record.get("error_msg"),
)
else:
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- file_meta=record.get('file_meta'),
- text=record.get('text'),
- has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
- meta_xml=record.get('meta_xml'),
- pdf_info=record.get('pdf_info'),
- pdf_extra=record.get('pdf_extra'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ file_meta=record.get("file_meta"),
+ text=record.get("text"),
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ meta_xml=record.get("meta_xml"),
+ pdf_info=record.get("pdf_info"),
+ pdf_extra=record.get("pdf_extra"),
)
- @classmethod
- def from_pdf_meta_dict(cls, record):
+ @staticmethod
+ def from_pdf_meta_dict(record: Dict[str, Any]) -> "PdfExtractResult":
"""
Parses what would be returned from postgrest
"""
- if record['status'] != 'success':
+ if record["status"] != "success":
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- error_msg=(record.get('metadata') or {}).get('error_msg'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ error_msg=(record.get("metadata") or {}).get("error_msg"),
)
else:
pdf_extra = dict()
- for k in ('page_count', 'page0_height', 'page0_width', 'permanent_id', 'pdf_version'):
+ for k in (
+ "page_count",
+ "page0_height",
+ "page0_width",
+ "permanent_id",
+ "pdf_version",
+ ):
if record.get(k):
pdf_extra[k] = record[k]
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
- pdf_info=record.get('metadata'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ pdf_info=record.get("metadata"),
pdf_extra=pdf_extra,
)
@@ -237,31 +269,33 @@ class PdfExtractResult:
# TODO: form, encrypted
if self.pdf_info:
metadata = dict()
- for k in ('Title', 'Subject', 'Author', 'Creator', 'Producer', 'doi'):
+ for k in ("Title", "Subject", "Author", "Creator", "Producer", "doi"):
if k in self.pdf_info:
metadata[k.lower()] = self.pdf_info[k]
- if 'CreationDate' in self.pdf_info:
- pdf_created = self.pdf_info['CreationDate']
+ if "CreationDate" in self.pdf_info:
+ pdf_created = self.pdf_info["CreationDate"]
metadata_json: Optional[str] = None
if metadata:
metadata_json = json.dumps(metadata, sort_keys=True)
return (
self.sha1hex,
- datetime.datetime.now(), # updated
+ datetime.datetime.now(), # updated
self.status,
self.has_page0_thumbnail,
- pdf_extra.get('page_count'),
+ pdf_extra.get("page_count"),
word_count,
- pdf_extra.get('page0_height'),
- pdf_extra.get('page0_width'),
- pdf_extra.get('permanent_id'),
+ pdf_extra.get("page0_height"),
+ pdf_extra.get("page0_width"),
+ pdf_extra.get("permanent_id"),
pdf_created,
- pdf_extra.get('pdf_version'),
+ pdf_extra.get("pdf_version"),
metadata_json,
)
-def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult:
+def process_pdf(
+ blob: bytes, thumb_size: Tuple[int, int] = (180, 300), thumb_type: str = "JPEG"
+) -> PdfExtractResult:
"""
A known issue is that output text is in "physical layout" mode, which means
columns will be side-by-side. We would prefer a single stream of tokens!
@@ -271,11 +305,11 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
didn't seem to work at all (returned empty strings).
"""
file_meta = gen_file_metadata(blob)
- sha1hex = file_meta['sha1hex']
- if file_meta['mimetype'] != 'application/pdf':
+ sha1hex = file_meta["sha1hex"]
+ if file_meta["mimetype"] != "application/pdf":
return PdfExtractResult(
sha1hex=sha1hex,
- status='not-pdf',
+ status="not-pdf",
error_msg=f"mimetype is '{file_meta['mimetype']}'",
file_meta=file_meta,
)
@@ -283,8 +317,8 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
if sha1hex in BAD_PDF_SHA1HEX:
return PdfExtractResult(
sha1hex=sha1hex,
- status='bad-pdf',
- error_msg=f"PDF known to cause processing issues",
+ status="bad-pdf",
+ error_msg="PDF known to cause processing issues",
file_meta=file_meta,
)
@@ -294,7 +328,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
if pdf is None:
return PdfExtractResult(
sha1hex=sha1hex,
- status='empty-pdf',
+ status="empty-pdf",
file_meta=file_meta,
has_page0_thumbnail=False,
)
@@ -302,17 +336,18 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
if page0 is None:
return PdfExtractResult(
sha1hex=sha1hex,
- status='empty-page0',
+ status="empty-page0",
file_meta=file_meta,
)
         # this call sometimes fails and returns an AttributeError
page0rect = page0.page_rect()
- except (AttributeError, poppler.document.LockedDocumentError) as e:
+ # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch
+ except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e:
# may need to expand the set of exceptions caught here over time, but
# starting with a narrow set
return PdfExtractResult(
sha1hex=sha1hex,
- status='parse-error',
+ status="parse-error",
error_msg=str(e),
file_meta=file_meta,
)
@@ -322,7 +357,9 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
renderer = poppler.PageRenderer()
try:
full_img = renderer.render_page(page0)
- img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "BGRA", 0, 1)
+ img = Image.frombuffer(
+ "RGBA", (full_img.width, full_img.height), full_img.data, "raw", "BGRA", 0, 1
+ )
img.thumbnail(thumb_size, Image.BICUBIC)
buf = BytesIO()
img.save(buf, thumb_type)
@@ -342,23 +379,23 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
except AttributeError as e:
return PdfExtractResult(
sha1hex=sha1hex,
- status='parse-error',
+ status="parse-error",
error_msg=str(e),
file_meta=file_meta,
)
# Kafka message size limit; cap at about 1 MByte
- if len(full_text)> 1000000:
+ if len(full_text) > 1000000:
return PdfExtractResult(
sha1hex=sha1hex,
- status='text-too-large',
+ status="text-too-large",
error_msg="full_text chars: {}".format(len(full_text)),
file_meta=file_meta,
)
- if len(pdf.metadata)> 1000000:
+ if len(pdf.metadata) > 1000000:
return PdfExtractResult(
sha1hex=sha1hex,
- status='text-too-large',
+ status="text-too-large",
                 error_msg="meta_xml chars: {}".format(len(pdf.metadata)),
file_meta=file_meta,
)
@@ -368,7 +405,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
except UnicodeDecodeError:
return PdfExtractResult(
sha1hex=sha1hex,
- status='bad-unicode',
+ status="bad-unicode",
error_msg="in infos()",
file_meta=file_meta,
)
@@ -389,7 +426,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
return PdfExtractResult(
sha1hex=sha1hex,
file_meta=file_meta,
- status='success',
+ status="success",
error_msg=None,
text=full_text or None,
has_page0_thumbnail=page0_thumbnail is not None,
@@ -406,16 +443,21 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
),
)
-class PdfExtractWorker(SandcrawlerFetchWorker):
- def __init__(self, wayback_client=None, sink=None, **kwargs):
+class PdfExtractWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
super().__init__(wayback_client=wayback_client)
self.wayback_client = wayback_client
self.sink = sink
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
- def timeout_response(self, task) -> Dict:
- default_key = task['sha1hex']
+ def timeout_response(self, task: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = task["sha1hex"]
return dict(
status="error-timeout",
error_msg="internal pdf-extract worker timeout",
@@ -423,13 +465,12 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
sha1hex=default_key,
)
- def process(self, record, key: Optional[str] = None):
- default_key = record['sha1hex']
-
+ def process(self, record: Any, key: Optional[str] = None) -> dict:
fetch_result = self.fetch_blob(record)
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
result = process_pdf(blob)
result.source = record
@@ -437,18 +478,19 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
return result.to_pdftext_dict()
+
class PdfExtractBlobWorker(SandcrawlerWorker):
"""
This is sort of like PdfExtractWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, sink=None, **kwargs):
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__()
self.sink = sink
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
- def process(self, blob, key: Optional[str] = None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
assert isinstance(blob, bytes)
@@ -458,4 +500,3 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
return result.to_pdftext_dict()
-
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 161dc9c..112df6a 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,18 +1,19 @@
-
import time
+from typing import Any, Dict, Optional
+
import requests
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .ia import WaybackClient
from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
-
- def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+ def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- def classify_pdf(self, blob, mode="auto"):
+ def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
"""
Returns a dict with at least:
@@ -25,45 +26,43 @@ class PdfTrioClient(object):
appropriately; an optional `error_msg` may also be set. For some other
errors, like connection failure, an exception is raised.
"""
- assert blob
+ assert blob and type(blob) == bytes
try:
- pdftrio_response = requests.post(
+ pdftrio_response = self.http_session.post(
self.host_url + "/classify/research-pub/" + mode,
files={
- 'pdf_content': blob,
+ "pdf_content": blob,
},
timeout=60.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'pdftrio request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "pdftrio request (HTTP POST) timeout",
}
except requests.exceptions.ConnectionError:
# crude back-off
time.sleep(2.0)
return {
- 'status': 'error-connect',
- 'status_code': -2, # heritrix3 "HTTP connect" code
- 'error_msg': 'pdftrio request connection timout',
+ "status": "error-connect",
+ "status_code": -2, # heritrix3 "HTTP connect" code
+ "error_msg": "pdftrio request connection timeout",
}
- info = dict(
- status_code=pdftrio_response.status_code,
- )
+ info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
if pdftrio_response.status_code == 200:
resp_json = pdftrio_response.json()
- assert 'ensemble_score' in resp_json
- assert 'status' in resp_json
- assert 'versions' in resp_json
+ assert "ensemble_score" in resp_json
+ assert "status" in resp_json
+ assert "versions" in resp_json
info.update(resp_json)
else:
- info['status'] = 'error'
+ info["status"] = "error"
# TODO: might return JSON with some info?
- info['_total_sec'] = pdftrio_response.elapsed.total_seconds()
+ info["_total_sec"] = pdftrio_response.elapsed.total_seconds()
return info
@@ -72,59 +71,72 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
This class is basically copied directly from GrobidWorker
"""
- def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
- super().__init__(wayback_client=wayback_client)
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs
+ ):
+ super().__init__(wayback_client=wayback_client, **kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
- def process(self, record, key=None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
- default_key = record['sha1hex']
fetch_sec = None
start = time.time()
fetch_result = self.fetch_blob(record)
fetch_sec = time.time() - start
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
- result['source'] = record
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob)
+ result["source"] = record
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
if fetch_sec:
- result['timing']['fetch_sec'] = fetch_sec
+ result["timing"]["fetch_sec"] = fetch_sec
return result
+
class PdfTrioBlobWorker(SandcrawlerWorker):
"""
This is sort of like PdfTrioWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs):
- super().__init__()
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ sink: Optional[SandcrawlerWorker] = None,
+ mode: str = "auto",
+ **kwargs
+ ):
+ super().__init__(**kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
self.mode = mode
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
if not blob:
return None
+ assert isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
return result
-
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 0fd54a4..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -1,4 +1,3 @@
-
"""
cdx
- read raw CDX, filter
@@ -20,106 +19,112 @@ grobid
"""
import os
-from typing import Optional, AnyStr
+import time
import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
+
+import psycopg2
+import requests
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
+from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.pdfextract import PdfExtractResult
-from sandcrawler.html_ingest import HtmlMetaRow
+from sandcrawler.workers import SandcrawlerWorker
class PersistCdxWorker(SandcrawlerWorker):
-
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
# filter to full CDX lines, no liveweb
- cdx_batch = [r for r in batch if r.get('warc_path') and ("/" in r['warc_path'])]
+ cdx_batch = [r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])]
resp = self.db.insert_cdx(self.cur, cdx_batch)
if len(cdx_batch) < len(batch):
- self.counts['skip'] += len(batch) - len(cdx_batch)
- self.counts['insert-cdx'] += resp[0]
- self.counts['update-cdx'] += resp[1]
+ self.counts["skip"] += len(batch) - len(cdx_batch)
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
self.db.commit()
return []
-class PersistIngestFileResultWorker(SandcrawlerWorker):
- def __init__(self, db_url, **kwargs):
+class PersistIngestFileResultWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def request_to_row(self, raw):
+ def request_to_row(self, raw: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Converts ingest-request JSON schema (eg, from Kafka) to SQL ingest_request schema
if there is a problem with conversion, return None
"""
# backwards compat hacks; transform request to look like current schema
- if raw.get('ingest_type') == 'file':
- raw['ingest_type'] = 'pdf'
- if (not raw.get('link_source')
- and raw.get('base_url')
- and raw.get('ext_ids', {}).get('doi')
- and raw['base_url'] == "https://doi.org/{}".format(raw['ext_ids']['doi'])):
+ if raw.get("ingest_type") == "file":
+ raw["ingest_type"] = "pdf"
+ if (
+ not raw.get("link_source")
+ and raw.get("base_url")
+ and raw.get("ext_ids", {}).get("doi")
+ and raw["base_url"] == "https://doi.org/{}".format(raw["ext_ids"]["doi"])
+ ):
# set link_source(_id) for old ingest requests
- raw['link_source'] = 'doi'
- raw['link_source_id'] = raw['ext_ids']['doi']
- if (not raw.get('link_source')
- and raw.get('ingest_request_source', '').startswith('savepapernow')
- and raw.get('fatcat', {}).get('release_ident')):
+ raw["link_source"] = "doi"
+ raw["link_source_id"] = raw["ext_ids"]["doi"]
+ if (
+ not raw.get("link_source")
+ and raw.get("ingest_request_source", "").startswith("savepapernow")
+ and raw.get("fatcat", {}).get("release_ident")
+ ):
# set link_source(_id) for old ingest requests
- raw['link_source'] = 'spn'
- raw['link_source_id'] = raw['fatcat']['release_ident']
+ raw["link_source"] = "spn"
+ raw["link_source_id"] = raw["fatcat"]["release_ident"]
- for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'):
- if not k in raw:
- self.counts['skip-request-fields'] += 1
+ for k in ("ingest_type", "base_url", "link_source", "link_source_id"):
+ if k not in raw:
+ self.counts["skip-request-fields"] += 1
return None
- if raw['ingest_type'] not in ('pdf', 'xml', 'html'):
- self.counts['skip-ingest-type'] += 1
+ if raw["ingest_type"] not in ("pdf", "xml", "html"):
+ self.counts["skip-ingest-type"] += 1
+ return None
+ # limit on base_url length
+ if len(raw["base_url"]) > 1500:
+ self.counts["skip-url-too-long"] += 1
return None
request = {
- 'ingest_type': raw['ingest_type'],
- 'base_url': raw['base_url'],
- 'link_source': raw['link_source'],
- 'link_source_id': raw['link_source_id'],
- 'ingest_request_source': raw.get('ingest_request_source'),
- 'request': {},
+ "ingest_type": raw["ingest_type"],
+ "base_url": raw["base_url"],
+ "link_source": raw["link_source"],
+ "link_source_id": raw["link_source_id"],
+ "ingest_request_source": raw.get("ingest_request_source"),
+ "request": {},
}
# extra/optional fields
- if raw.get('release_stage'):
- request['release_stage'] = raw['release_stage']
- if raw.get('fatcat', {}).get('release_ident'):
- request['request']['release_ident'] = raw['fatcat']['release_ident']
- for k in ('ext_ids', 'edit_extra', 'rel'):
+ if raw.get("release_stage"):
+ request["release_stage"] = raw["release_stage"]
+ if raw.get("fatcat", {}).get("release_ident"):
+ request["request"]["release_ident"] = raw["fatcat"]["release_ident"]
+ for k in ("ext_ids", "edit_extra", "rel"):
if raw.get(k):
- request['request'][k] = raw[k]
+ request["request"][k] = raw[k]
# if this dict is empty, trim it to save DB space
- if not request['request']:
- request['request'] = None
+ if not request["request"]:
+ request["request"] = None
return request
-
def file_result_to_row(self, raw: dict) -> Optional[dict]:
"""
@@ -127,208 +132,302 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if there is a problem with conversion, return None and set skip count
"""
- for k in ('request', 'hit', 'status'):
- if not k in raw:
- self.counts['skip-result-fields'] += 1
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ self.counts["skip-result-fields"] += 1
return None
- if not 'base_url' in raw['request']:
- self.counts['skip-result-fields'] += 1
+ if "base_url" not in raw["request"]:
+ self.counts["skip-result-fields"] += 1
return None
- ingest_type = raw['request'].get('ingest_type')
- if ingest_type == 'file':
- ingest_type = 'pdf'
- if ingest_type not in ('pdf', 'xml', 'html'):
- self.counts['skip-ingest-type'] += 1
+ ingest_type = raw["request"].get("ingest_type")
+ if ingest_type == "file":
+ ingest_type = "pdf"
+ if ingest_type not in (
+ "pdf",
+ "xml",
+ "html",
+ "component",
+ "src",
+ "dataset",
+ "dataset-file",
+ ):
+ self.counts["skip-ingest-type"] += 1
return None
- if raw['status'] in ("existing", ):
- self.counts['skip-existing'] += 1
+ if raw["status"] in ("existing",):
+ self.counts["skip-existing"] += 1
return None
result = {
- 'ingest_type': ingest_type,
- 'base_url': raw['request']['base_url'],
- 'hit': raw['hit'],
- 'status': raw['status'],
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
}
- terminal = raw.get('terminal')
+ terminal = raw.get("terminal")
if terminal:
- result['terminal_url'] = terminal.get('terminal_url') or terminal.get('url')
- result['terminal_dt'] = terminal.get('terminal_dt')
- result['terminal_status_code'] = terminal.get('terminal_status_code') or terminal.get('status_code') or terminal.get('http_code')
- if result['terminal_status_code']:
- result['terminal_status_code'] = int(result['terminal_status_code'])
- result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
+ result["terminal_url"] = terminal.get("terminal_url") or terminal.get("url")
+ result["terminal_dt"] = terminal.get("terminal_dt")
+ result["terminal_status_code"] = (
+ terminal.get("terminal_status_code")
+ or terminal.get("status_code")
+ or terminal.get("http_code")
+ )
+ if result["terminal_status_code"]:
+ result["terminal_status_code"] = int(result["terminal_status_code"])
+ result["terminal_sha1hex"] = terminal.get("terminal_sha1hex")
+ if len(result["terminal_url"]) > 2048:
+ # postgresql13 doesn't like extremely large URLs in b-tree index
+ self.counts["skip-huge-url"] += 1
+ return None
return result
def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
- html_body = record.get('html_body')
- file_meta = record.get('file_meta')
+ html_body = record.get("html_body")
+ file_meta = record.get("file_meta")
if not (file_meta and html_body):
return None
return HtmlMetaRow(
sha1hex=file_meta["sha1hex"],
- status=record.get('status'),
- scope=record.get('scope'),
- has_teixml=bool(html_body and html_body['status'] == 'success'),
+ status=record.get("status"),
+ scope=record.get("scope"),
+ has_teixml=bool(html_body and html_body["status"] == "success"),
has_thumbnail=False, # TODO
- word_count=(html_body and html_body.get('word_count')) or None,
- biblio=record.get('html_biblio'),
- resources=record.get('html_resources'),
+ word_count=(html_body and html_body.get("word_count")) or None,
+ biblio=record.get("html_biblio"),
+ resources=record.get("html_resources"),
)
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def result_to_platform_row(self, raw: dict) -> Optional[dict]:
+ """
+ Converts fileset ingest-result JSON schema (eg, from Kafka) to SQL ingest_fileset_platform schema
+
+ if there is a problem with conversion, return None and set skip count
+ """
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ return None
+ if "base_url" not in raw["request"]:
+ return None
+ ingest_type = raw["request"].get("ingest_type")
+        if ingest_type not in ("dataset",):
+ return None
+ if raw["status"] in ("existing",):
+ return None
+ if not raw.get("platform_name"):
+ return None
+ result = {
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
+ "platform_name": raw.get("platform_name"),
+ "platform_domain": raw.get("platform_domain"),
+ "platform_id": raw.get("platform_id"),
+ "ingest_strategy": raw.get("ingest_strategy"),
+ "total_size": raw.get("total_size"),
+ "file_count": raw.get("file_count"),
+ "archiveorg_item_name": raw.get("archiveorg_item_name"),
+ "archiveorg_item_bundle_path": None,
+ "web_bundle_url": None,
+ "web_bundle_dt": None,
+ "manifest": raw.get("manifest"),
+ }
+        if raw.get("fileset_bundle"):
+            result["archiveorg_item_bundle_path"] = raw["fileset_bundle"].get(
+                "archiveorg_item_bundle_path"
+            )
+            result["web_bundle_url"] = (
+                raw["fileset_bundle"].get("terminal", {}).get("terminal_url")
+            )
+            result["web_bundle_dt"] = (
+                raw["fileset_bundle"].get("terminal", {}).get("terminal_dt")
+            )
+ return result
+
+ def push_batch(self, batch: List[Any]) -> List[Any]:
+ self.counts["total"] += len(batch)
if not batch:
return []
- results = [self.file_result_to_row(raw) for raw in batch]
- results = [r for r in results if r]
+ results_unfiltered = [self.file_result_to_row(raw) for raw in batch]
+ results = [r for r in results_unfiltered if r]
- requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')]
- requests = [r for r in requests if r]
+ irequests_unfiltered = [
+ self.request_to_row(raw["request"]) for raw in batch if raw.get("request")
+ ]
+ irequests = [
+ r for r in irequests_unfiltered if r and r["ingest_type"] != "dataset-file"
+ ]
- if requests:
- resp = self.db.insert_ingest_request(self.cur, requests)
- self.counts['insert-requests'] += resp[0]
- self.counts['update-requests'] += resp[1]
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
if results:
resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update")
- self.counts['insert-results'] += resp[0]
- self.counts['update-results'] += resp[1]
+ self.counts["insert-results"] += resp[0]
+ self.counts["update-results"] += resp[1]
# these schemas match, so can just pass through
- cdx_batch = [r['cdx'] for r in batch if r.get('hit') and r.get('cdx')]
- revisit_cdx_batch = [r['revisit_cdx'] for r in batch if r.get('hit') and r.get('revisit_cdx')]
+ cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")]
+ revisit_cdx_batch = [
+ r["revisit_cdx"] for r in batch if r.get("hit") and r.get("revisit_cdx")
+ ]
cdx_batch.extend(revisit_cdx_batch)
# filter to full CDX lines, with full warc_paths (not liveweb)
- cdx_batch = [r for r in cdx_batch if r.get('warc_path') and ("/" in r['warc_path'])]
+ cdx_batch = [r for r in cdx_batch if r.get("warc_path") and ("/" in r["warc_path"])]
if cdx_batch:
resp = self.db.insert_cdx(self.cur, cdx_batch)
- self.counts['insert-cdx'] += resp[0]
- self.counts['update-cdx'] += resp[1]
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
- file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("hit") and r.get("file_meta")]
if file_meta_batch:
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing")
- self.counts['insert-file_meta'] += resp[0]
- self.counts['update-file_meta'] += resp[1]
+ self.counts["insert-file_meta"] += resp[0]
+ self.counts["update-file_meta"] += resp[1]
- html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body')]
+ html_meta_batch = [
+ self.result_to_html_meta(r) for r in batch if r.get("hit") and r.get("html_body")
+ ]
if html_meta_batch:
- resp = self.db.insert_html_meta(self.cur, html_meta_batch, on_conflict="update")
- self.counts['insert-html_meta'] += resp[0]
- self.counts['update-html_meta'] += resp[1]
+ rows = [d.to_sql_tuple() for d in html_meta_batch if d]
+ resp = self.db.insert_html_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-html_meta"] += resp[0]
+ self.counts["update-html_meta"] += resp[1]
+
+ fileset_platform_batch_all = [
+ self.result_to_platform_row(raw)
+ for raw in batch
+ if raw.get("request", {}).get("ingest_type") == "dataset"
+ and raw.get("platform_name")
+ ]
+ fileset_platform_batch: List[Dict] = [p for p in fileset_platform_batch_all if p]
+ if fileset_platform_batch:
+ resp = self.db.insert_ingest_fileset_platform(
+ self.cur, fileset_platform_batch, on_conflict="update"
+ )
+ self.counts["insert-fileset_platform"] += resp[0]
+ self.counts["update-fileset_platform"] += resp[1]
self.db.commit()
return []
-class PersistIngestRequestWorker(PersistIngestFileResultWorker):
- def __init__(self, db_url, **kwargs):
+class PersistIngestFilesetWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+
+class PersistIngestRequestWorker(PersistIngestFileResultWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__(db_url=db_url)
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
if not batch:
return []
- requests = [self.request_to_row(raw) for raw in batch]
- requests = [r for r in requests if r]
+ irequests_all = [self.request_to_row(raw) for raw in batch]
+ irequests: List[Dict] = [r for r in irequests_all if r]
- if requests:
- resp = self.db.insert_ingest_request(self.cur, requests)
- self.counts['insert-requests'] += resp[0]
- self.counts['update-requests'] += resp[1]
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
self.db.commit()
return []
-class PersistGrobidWorker(SandcrawlerWorker):
- def __init__(self, db_url, **kwargs):
+class PersistGrobidWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.grobid = GrobidClient()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_only = kwargs.get('s3_only', False)
- self.db_only = kwargs.get('db_only', False)
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
# filter out bad "missing status_code" timeout rows
- missing = [r for r in batch if not r.get('status_code')]
+ missing = [r for r in batch if not r.get("status_code")]
if missing:
- self.counts['skip-missing-status'] += len(missing)
- batch = [r for r in batch if r.get('status_code')]
+ self.counts["skip-missing-status"] += len(missing)
+ batch = [r for r in batch if r.get("status_code")]
for r in batch:
- if r['status_code'] != 200 or not r.get('tei_xml'):
- self.counts['s3-skip-status'] += 1
- if r.get('error_msg'):
- r['metadata'] = {'error_msg': r['error_msg'][:500]}
+ if r["status_code"] != 200 or not r.get("tei_xml"):
+ self.counts["s3-skip-status"] += 1
+ if r.get("error_msg"):
+ r["metadata"] = {"error_msg": r["error_msg"][:500]}
continue
- assert len(r['key']) == 40
+ assert len(r["key"]) == 40
if not self.db_only:
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder="grobid",
- blob=r['tei_xml'],
- sha1hex=r['key'],
+ blob=r["tei_xml"],
+ sha1hex=r["key"],
extension=".tei.xml",
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
- # enhance with teixml2json metadata, if available
+ # enhance with GROBID TEI-XML metadata, if available
try:
metadata = self.grobid.metadata(r)
except xml.etree.ElementTree.ParseError as xml_e:
- r['status'] = 'bad-grobid-xml'
- r['metadata'] = {'error_msg': str(xml_e)[:1024]}
+ r["status"] = "bad-grobid-xml"
+ r["metadata"] = {"error_msg": str(xml_e)[:1024]}
continue
if not metadata:
continue
- for k in ('fatcat_release', 'grobid_version'):
+ for k in ("fatcat_release", "grobid_version"):
r[k] = metadata.pop(k, None)
- if r.get('fatcat_release'):
- r['fatcat_release'] = r['fatcat_release'].replace('release_', '')
- if metadata.get('grobid_timestamp'):
- r['updated'] = metadata['grobid_timestamp']
- r['metadata'] = metadata
+ if r.get("fatcat_release"):
+ r["fatcat_release"] = r["fatcat_release"].replace("release_", "")
+ if metadata.get("grobid_timestamp"):
+ r["updated"] = metadata["grobid_timestamp"]
+ r["metadata"] = metadata
if not self.s3_only:
+ assert self.db and self.cur
resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
- self.counts['insert-grobid'] += resp[0]
- self.counts['update-grobid'] += resp[1]
+ self.counts["insert-grobid"] += resp[0]
+ self.counts["update-grobid"] += resp[1]
- file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("file_meta")]
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
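# A minimal sketch (not from the patch) of how PersistGrobidWorker.push_batch()
# above folds GROBID metadata into each row before insert_grobid(); the sample
# values below are invented.
def enrich_grobid_row(r: dict, metadata: dict) -> dict:
    for k in ("fatcat_release", "grobid_version"):
        r[k] = metadata.pop(k, None)
    if r.get("fatcat_release"):
        r["fatcat_release"] = r["fatcat_release"].replace("release_", "")
    if metadata.get("grobid_timestamp"):
        r["updated"] = metadata["grobid_timestamp"]
    r["metadata"] = metadata
    return r

row = enrich_grobid_row(
    {"key": "a" * 40, "status_code": 200},
    {
        "fatcat_release": "release_abcd1234",
        "grobid_version": "0.7.1",
        "grobid_timestamp": "2021-11-01T00:00:00Z",
        "title": "An Example Title",
    },
)
# row["fatcat_release"] == "abcd1234"; row["updated"] is the GROBID timestamp;
# whatever remains in `metadata` (including the timestamp) lands in row["metadata"].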
@@ -342,11 +441,11 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
This could be refactored into a "Sink" type with an even thinner wrapper.
"""
- def __init__(self, output_dir):
+ def __init__(self, output_dir: str):
super().__init__()
self.output_dir = output_dir
- def _blob_path(self, sha1hex, extension=".tei.xml"):
+ def _blob_path(self, sha1hex: str, extension: str = ".tei.xml") -> str:
obj_path = "{}/{}/{}{}".format(
sha1hex[0:2],
sha1hex[2:4],
@@ -355,48 +454,49 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
)
return obj_path
- def process(self, record, key=None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
- if record.get('status_code') != 200 or not record.get('tei_xml'):
+ if record.get("status_code") != 200 or not record.get("tei_xml"):
return False
- assert(len(record['key'])) == 40
- p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))
+ assert (len(record["key"])) == 40
+ p = "{}/{}".format(self.output_dir, self._blob_path(record["key"]))
os.makedirs(os.path.dirname(p), exist_ok=True)
- with open(p, 'w') as f:
- f.write(record.pop('tei_xml'))
- self.counts['written'] += 1
+ with open(p, "w") as f:
+ f.write(record.pop("tei_xml"))
+ self.counts["written"] += 1
return record
class PersistPdfTrioWorker(SandcrawlerWorker):
-
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
- batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')]
+ batch = [r for r in batch if "pdf_trio" in r and r["pdf_trio"].get("status_code")]
for r in batch:
# copy key (sha1hex) into sub-object
- r['pdf_trio']['key'] = r['key']
- pdftrio_batch = [r['pdf_trio'] for r in batch]
+ r["pdf_trio"]["key"] = r["key"]
+ pdftrio_batch = [r["pdf_trio"] for r in batch]
resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
- self.counts['insert-pdftrio'] += resp[0]
- self.counts['update-pdftrio'] += resp[1]
-
- file_meta_batch = [r['file_meta'] for r in batch if r['pdf_trio']['status'] == "success" and r.get('file_meta')]
+ self.counts["insert-pdftrio"] += resp[0]
+ self.counts["update-pdftrio"] += resp[1]
+
+ file_meta_batch = [
+ r["file_meta"]
+ for r in batch
+ if r["pdf_trio"]["status"] == "success" and r.get("file_meta")
+ ]
resp = self.db.insert_file_meta(self.cur, file_meta_batch)
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
return []
@@ -409,63 +509,63 @@ class PersistPdfTextWorker(SandcrawlerWorker):
Should keep batch sizes small.
"""
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_only = kwargs.get('s3_only', False)
- self.db_only = kwargs.get('db_only', False)
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
parsed_batch = []
for r in batch:
parsed_batch.append(PdfExtractResult.from_pdftext_dict(r))
for r in parsed_batch:
- if r.status != 'success' or not r.text:
- self.counts['s3-skip-status'] += 1
+ if r.status != "success" or not r.text:
+ self.counts["s3-skip-status"] += 1
if r.error_msg:
- r.metadata = {'error_msg': r.error_msg[:500]}
+ r.metadata = {"error_msg": r.error_msg[:500]}
continue
assert len(r.sha1hex) == 40
if not self.db_only:
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder="text",
blob=r.text,
sha1hex=r.sha1hex,
extension=".txt",
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
if not self.s3_only:
- resp = self.db.insert_pdf_meta(self.cur, parsed_batch, on_conflict="update")
- self.counts['insert-pdf-meta'] += resp[0]
- self.counts['update-pdf-meta'] += resp[1]
+ assert self.db and self.cur
+ rows = [r.to_sql_tuple() for r in parsed_batch]
+ resp = self.db.insert_pdf_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-pdf-meta"] += resp[0]
+ self.counts["update-pdf-meta"] += resp[1]
file_meta_batch = [r.file_meta for r in parsed_batch if r.file_meta]
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
@@ -484,32 +584,33 @@ class PersistThumbnailWorker(SandcrawlerWorker):
def __init__(self, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_extension = kwargs.get('s3_extension', ".jpg")
- self.s3_folder = kwargs.get('s3_folder', "pdf")
+ self.s3_extension = kwargs.get("s3_extension", ".jpg")
+ self.s3_folder = kwargs.get("s3_folder", "pdf")
- def process(self, blob: bytes, key: Optional[str] = None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
"""
Processing raw messages, not decoded JSON objects
"""
+ assert isinstance(record, bytes)
+ blob: bytes = record
if isinstance(key, bytes):
- key = key.decode('utf-8')
+ key = key.decode("utf-8")
assert key is not None and len(key) == 40 and isinstance(key, str)
- assert isinstance(blob, bytes)
assert len(blob) >= 50
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder=self.s3_folder,
blob=blob,
sha1hex=key,
extension=self.s3_extension,
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
class GenericPersistDocWorker(SandcrawlerWorker):
@@ -522,36 +623,36 @@ class GenericPersistDocWorker(SandcrawlerWorker):
def __init__(self, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_extension = kwargs.get('s3_extension', ".unknown")
- self.s3_folder = kwargs.get('s3_folder', "unknown")
+ self.s3_extension = kwargs.get("s3_extension", ".unknown")
+ self.s3_folder = kwargs.get("s3_folder", "unknown")
self.doc_key = "unknown"
- def process(self, record: dict, key: Optional[AnyStr] = None) -> None:
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
- if record.get('status') != 'success' or not record.get(self.doc_key):
+ if record.get("status") != "success" or not record.get(self.doc_key):
return
assert key is not None
if isinstance(key, bytes):
- key_str = key.decode('utf-8')
+ key_str = key.decode("utf-8")
elif isinstance(key, str):
key_str = key
assert len(key_str) == 40
- if 'sha1hex' in record:
- assert key_str == record['sha1hex']
+ if "sha1hex" in record:
+ assert key_str == record["sha1hex"]
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder=self.s3_folder,
- blob=record[self.doc_key].encode('utf-8'),
+ blob=record[self.doc_key].encode("utf-8"),
sha1hex=key_str,
extension=self.s3_extension,
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
class PersistXmlDocWorker(GenericPersistDocWorker):
@@ -562,8 +663,8 @@ class PersistXmlDocWorker(GenericPersistDocWorker):
def __init__(self, **kwargs):
super().__init__(**kwargs)
- self.s3_extension = kwargs.get('s3_extension', ".jats.xml")
- self.s3_folder = kwargs.get('s3_folder', "xml_doc")
+ self.s3_extension = kwargs.get("s3_extension", ".jats.xml")
+ self.s3_folder = kwargs.get("s3_folder", "xml_doc")
self.doc_key = "jats_xml"
@@ -575,6 +676,110 @@ class PersistHtmlTeiXmlWorker(GenericPersistDocWorker):
def __init__(self, **kwargs):
super().__init__(**kwargs)
- self.s3_extension = kwargs.get('s3_extension', ".tei.xml")
- self.s3_folder = kwargs.get('s3_folder', "html_body")
+ self.s3_extension = kwargs.get("s3_extension", ".tei.xml")
+ self.s3_folder = kwargs.get("s3_folder", "html_body")
self.doc_key = "tei_xml"
+
+
+class PersistCrossrefWorker(SandcrawlerWorker):
+ """
+ Pushes Crossref API JSON records into postgresql. Can also talk to GROBID,
+    parse 'unstructured' references, and push the results into postgresql at
+ the same time.
+ """
+
+ def __init__(
+ self,
+ db_url: str,
+ grobid_client: Optional[GrobidClient],
+ parse_refs: bool = True,
+ **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ if grobid_client:
+ self.grobid_client = grobid_client
+ else:
+ self.grobid_client = GrobidClient()
+ self.parse_refs = parse_refs
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ crossref_batch = []
+ refs_batch = []
+ for record in batch:
+ crossref_batch.append(
+ dict(
+ doi=record["DOI"].lower().strip(),
+ indexed=record["indexed"]["date-time"],
+ record=record,
+ )
+ )
+ if self.parse_refs:
+ try:
+ parsed_refs = self.grobid_client.crossref_refs(record)
+ refs_batch.append(parsed_refs)
+ except (
+ xml.etree.ElementTree.ParseError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.ReadTimeout,
+ ):
+ print("GROBID crossref refs parsing error, skipping with a sleep")
+ time.sleep(3)
+ pass
+
+ resp = self.db.insert_crossref(self.cur, crossref_batch)
+ if len(crossref_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(crossref_batch)
+ self.counts["insert-crossref"] += resp[0]
+ self.counts["update-crossref"] += resp[1]
+
+ if refs_batch:
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
+
+
+class PersistGrobidRefsWorker(SandcrawlerWorker):
+ """
+    Simple persist worker to backfill GROBID references into postgresql
+ locally. Consumes the JSON output from GROBID CrossrefRefsWorker.
+ """
+
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ refs_batch = []
+ for record in batch:
+ assert record["source"]
+ assert record["source_id"]
+ refs_batch.append(record)
+
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
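# A minimal sketch (not from the patch) of the row shape that
# PersistCrossrefWorker.push_batch() builds per Crossref API record before
# insert_crossref(); the sample record is invented and heavily trimmed.
sample_record = {
    "DOI": "10.1000/EXAMPLE.123 ",
    "indexed": {"date-time": "2021-11-12T00:00:00Z"},
    "title": ["An Example Article"],
}
crossref_row = dict(
    doi=sample_record["DOI"].lower().strip(),  # -> "10.1000/example.123"
    indexed=sample_record["indexed"]["date-time"],
    record=sample_record,  # full JSON record stored as-is
)
# With parse_refs enabled, grobid_client.crossref_refs(record) output is
# accumulated into a separate batch for insert_grobid_refs().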
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..356f050 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,22 @@
-
-import sys
import json
-import time
+import multiprocessing.pool
import signal
+import sys
+import time
import zipfile
-import requests
-import multiprocessing.pool
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+from typing import Any, Dict, List, Optional, Sequence
-from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
+from confluent_kafka import Consumer, KafkaException, Producer
+
+from .ia import (
+ PetaboxError,
+ SandcrawlerBackoffError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .misc import parse_cdx_line, requests_retry_session
class SandcrawlerWorker(object):
@@ -21,31 +27,30 @@ class SandcrawlerWorker(object):
worker (pipeline-style), or defaults to stdout.
"""
- def __init__(self):
- self.counts = Counter()
- self.sink = None
- # TODO: self.counters
+ def __init__(self, sink: Optional["SandcrawlerWorker"] = None):
+ self.counts: Counter = Counter()
+ self.sink: Optional[SandcrawlerWorker] = sink
- def push_record(self, task, key=None):
- self.counts['total'] += 1
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
if not self.want(task):
- self.counts['skip'] += 1
+ self.counts["skip"] += 1
return
result = self.process(task, key=key)
if not result:
- self.counts['failed'] += 1
+ self.counts["failed"] += 1
return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return result
- def timeout_response(self, task):
+ def timeout_response(self, task: Any) -> Any:
"""
This should be overridden by workers that want to return something
meaningful when there is a processing timeout. Eg, JSON vs some other
@@ -53,7 +58,9 @@ class SandcrawlerWorker(object):
"""
return None
- def push_record_timeout(self, task, key=None, timeout=300):
+ def push_record_timeout(
+ self, task: Any, key: Optional[str] = None, timeout: int = 300
+ ) -> Any:
"""
A wrapper around self.push_record which sets a timeout.
@@ -62,49 +69,52 @@ class SandcrawlerWorker(object):
same process.
"""
- def timeout_handler(signum, frame):
+ def timeout_handler(signum: int, frame: Any) -> None:
raise TimeoutError("timeout processing record")
+
signal.signal(signal.SIGALRM, timeout_handler)
resp = None
signal.alarm(int(timeout))
try:
resp = self.push_record(task, key=key)
except TimeoutError:
- self.counts['timeout'] += 1
- resp = self.timeout_response(task) # pylint: disable=assignment-from-none
+ self.counts["timeout"] += 1
+ resp = self.timeout_response(task) # pylint: disable=assignment-from-none
# TODO: what if it is this push_record() itself that is timing out?
if resp and self.sink:
self.sink.push_record(resp)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
elif resp:
print(json.dumps(resp))
finally:
signal.alarm(0)
return resp
- def push_batch(self, tasks):
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
results = []
for task in tasks:
results.append(self.push_record(task))
return results
- def finish(self):
+ def finish(self) -> Counter:
if self.sink:
self.sink.finish()
print("Worker: {}".format(self.counts), file=sys.stderr)
return self.counts
- def want(self, task):
+ def want(self, task: Any) -> bool:
"""
Optionally override this as a filter in implementations.
"""
return True
- def process(self, task, key=None):
+ def process(self, task: Any, key: Optional[str] = None) -> Any:
"""
Derived workers need to implement business logic here.
+
+ TODO: should derived workers explicitly type-check the 'task' object?
"""
- raise NotImplementedError('implementation required')
+ raise NotImplementedError("implementation required")
class SandcrawlerFetchWorker(SandcrawlerWorker):
@@ -113,26 +123,26 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
PDFs) from wayback, archive.org, or other sources.
"""
- def __init__(self, wayback_client, **kwargs):
+ def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
super().__init__(**kwargs)
self.wayback_client = wayback_client
+ self.http_session = requests_retry_session()
- def fetch_blob(self, record):
- start_process = time.time()
- default_key = record['sha1hex']
+ def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = record["sha1hex"]
wayback_sec = None
petabox_sec = None
- if record.get('warc_path') and record.get('warc_offset'):
+ if record.get("warc_path") and record.get("warc_offset"):
# it's a full CDX dict. fetch using WaybackClient
if not self.wayback_client:
- raise Exception("wayback client not configured for this PdfTrioWorker")
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
try:
start = time.time()
- blob = self.wayback_client.fetch_petabox_body(
- csize=record['warc_csize'],
- offset=record['warc_offset'],
- warc_path=record['warc_path'],
+ blob: bytes = self.wayback_client.fetch_petabox_body(
+ csize=record["warc_csize"],
+ offset=record["warc_offset"],
+ warc_path=record["warc_path"],
)
wayback_sec = time.time() - start
except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
@@ -142,15 +152,15 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
status="error-wayback",
error_msg=str(we),
)
- elif record.get('url') and record.get('datetime'):
+ elif record.get("url") and record.get("datetime"):
# it's a partial CDX dict or something? fetch using WaybackClient
if not self.wayback_client:
- raise Exception("wayback client not configured for this PdfTrioWorker")
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
try:
start = time.time()
blob = self.wayback_client.fetch_replay_body(
- url=record['url'],
- datetime=record['datetime'],
+ url=record["url"],
+ datetime=record["datetime"],
)
wayback_sec = time.time() - start
except (WaybackError, WaybackContentError) as we:
@@ -160,14 +170,15 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
status="error-wayback",
error_msg=str(we),
)
- elif record.get('item') and record.get('path'):
+ elif record.get("item") and record.get("path"):
# it's petabox link; fetch via HTTP
start = time.time()
- resp = requests.get("https://archive.org/serve/{}/{}".format(
- record['item'], record['path']))
+ ia_resp = self.http_session.get(
+ "https://archive.org/serve/{}/{}".format(record["item"], record["path"])
+ )
petabox_sec = time.time() - start
try:
- resp.raise_for_status()
+ ia_resp.raise_for_status()
except Exception as e:
return dict(
key=default_key,
@@ -175,55 +186,67 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
status="error-petabox",
error_msg=str(e),
)
- blob = resp.content
+ blob = ia_resp.content
else:
- raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
+ raise ValueError(
+ "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed"
+ )
if not blob:
return dict(
key=default_key,
source=record,
status="empty-blob",
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
)
return dict(
key=default_key,
status="success",
source=record,
blob=blob,
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
)
-class MultiprocessWrapper(SandcrawlerWorker):
- def __init__(self, worker, sink, jobs=None):
+class MultiprocessWrapper(SandcrawlerWorker):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ sink: Optional[SandcrawlerWorker] = None,
+ jobs: Optional[int] = None,
+ ):
self.counts = Counter()
self.worker = worker
self.sink = sink
self.pool = multiprocessing.pool.Pool(jobs)
- def push_batch(self, tasks):
- self.counts['total'] += len(tasks)
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ self.counts["total"] += len(tasks)
print("... processing batch of: {}".format(len(tasks)), file=sys.stderr)
results = self.pool.map(self.worker.process, tasks)
for result in results:
if not result:
- self.counts['failed'] += 1
- return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ self.counts["failed"] += 1
+ return []
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return results
- def finish(self):
+ def finish(self) -> Counter:
self.pool.terminate()
if self.sink:
self.sink.finish()
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
- return worker_counts
+ return self.counts
+
class BlackholeSink(SandcrawlerWorker):
"""
@@ -232,73 +255,73 @@ class BlackholeSink(SandcrawlerWorker):
Useful for tests.
"""
- def push_record(self, task, key=None):
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
return
- def push_batch(self, tasks):
- return
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ return []
-class KafkaSink(SandcrawlerWorker):
- def __init__(self, kafka_hosts, produce_topic, **kwargs):
+class KafkaSink(SandcrawlerWorker):
+ def __init__(self, kafka_hosts: str, produce_topic: str, **kwargs):
self.sink = None
self.counts = Counter()
self.produce_topic = produce_topic
self.kafka_hosts = kafka_hosts
- config = self.producer_config({
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 30000000, # ~30 MBytes; broker is ~50 MBytes
- 'api.version.request': True,
- 'api.version.fallback.ms': 0,
- })
+ config = self.producer_config(
+ {
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 30000000, # ~30 MBytes; broker is ~50 MBytes
+ "api.version.request": True,
+ "api.version.fallback.ms": 0,
+ }
+ )
self.producer = Producer(config)
-
@staticmethod
- def _fail_fast(err, msg):
+ def _fail_fast(err: Any, msg: Any) -> None:
if err is not None:
print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: dict) -> dict:
config = kafka_config.copy()
- config.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'message.timeout.ms': 30000,
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
- def push_record(self, msg, key=None):
- self.counts['total'] += 1
+ def push_record(self, msg: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
if type(msg) == dict:
- if not key and 'key' in msg:
- key = msg['key']
+ if not key and "key" in msg:
+ key = msg["key"]
msg = json.dumps(msg)
if type(msg) == str:
- msg = msg.encode('utf-8')
+ msg = msg.encode("utf-8")
assert type(msg) == bytes
- self.producer.produce(
- self.produce_topic,
- msg,
- key=key,
- on_delivery=self._fail_fast)
- self.counts['produced'] += 1
+ self.producer.produce(self.produce_topic, msg, key=key, on_delivery=self._fail_fast)
+ self.counts["produced"] += 1
# check for errors etc
self.producer.poll(0)
- def push_batch(self, msgs):
+ def push_batch(self, msgs: List[Any]) -> List[Any]:
for m in msgs:
self.push_record(m)
+ return []
- def finish(self):
+ def finish(self) -> Counter:
self.producer.flush()
return self.counts
@@ -308,19 +331,21 @@ class KafkaCompressSink(KafkaSink):
Variant of KafkaSink for large documents. Used for, eg, GROBID output.
"""
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: Dict[str, Any]) -> Dict[str, Any]:
config = kafka_config.copy()
- config.update({
- 'compression.codec': 'gzip',
- 'retry.backoff.ms': 250,
- 'linger.ms': 1000,
- 'batch.num.messages': 50,
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'message.timeout.ms': 30000,
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "compression.codec": "gzip",
+ "retry.backoff.ms": 250,
+ "linger.ms": 1000,
+ "batch.num.messages": 50,
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
@@ -330,11 +355,11 @@ class RecordPusher:
trivial interface, just wraps an importer and pushes records in to it.
"""
- def __init__(self, worker, **kwargs):
- self.counts = Counter()
- self.worker = worker
+ def __init__(self, worker: SandcrawlerWorker, **kwargs):
+ self.counts: Counter = Counter()
+ self.worker: SandcrawlerWorker = worker
- def run(self):
+ def run(self) -> Counter:
"""
This will look something like:
@@ -347,133 +372,140 @@ class RecordPusher:
class JsonLinePusher(RecordPusher):
-
- def __init__(self, worker, json_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, json_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.json_file = json_file
- self.batch_size = kwargs.get('batch_size', None)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.json_file:
if not line:
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
try:
record = json.loads(line)
except json.decoder.JSONDecodeError:
- self.counts['error-json-decode'] += 1
+ self.counts["error-json-decode"] += 1
continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class CdxLinePusher(RecordPusher):
-
- def __init__(self, worker, cdx_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, cdx_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.cdx_file = cdx_file
- self.filter_http_statuses = kwargs.get('filter_http_statuses', None)
- self.filter_mimetypes = kwargs.get('filter_mimetypes', None)
- self.allow_octet_stream = kwargs.get('allow_octet_stream', False)
- self.batch_size = kwargs.get('batch_size', None)
+ self.filter_http_statuses = kwargs.get("filter_http_statuses", None)
+ self.filter_mimetypes = kwargs.get("filter_mimetypes", None)
+ self.allow_octet_stream = kwargs.get("allow_octet_stream", False)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.cdx_file:
if not line:
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
record = parse_cdx_line(line, normalize=True)
if not record:
- self.counts['skip-parse'] += 1
+ self.counts["skip-parse"] += 1
continue
- if self.filter_http_statuses and record['http_status'] not in self.filter_http_statuses:
- self.counts['skip-http_status'] += 1
+ if (
+ self.filter_http_statuses
+ and record["http_status"] not in self.filter_http_statuses
+ ):
+ self.counts["skip-http_status"] += 1
continue
- if self.filter_mimetypes and record['mimetype'] not in self.filter_mimetypes:
- self.counts['skip-mimetype'] += 1
+ if self.filter_mimetypes and record["mimetype"] not in self.filter_mimetypes:
+ self.counts["skip-mimetype"] += 1
continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class ZipfilePusher(RecordPusher):
-
- def __init__(self, worker, zipfile_path, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, zipfile_path: str, **kwargs):
self.counts = Counter()
self.worker = worker
self.filter_suffix = ".pdf"
self.zipfile_path = zipfile_path
- self.batch_size = kwargs.get('batch_size', None)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
- with zipfile.ZipFile(self.zipfile_path, 'r') as archive:
+ with zipfile.ZipFile(self.zipfile_path, "r") as archive:
for zipinfo in archive.infolist():
if not zipinfo.filename.endswith(self.filter_suffix):
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
# NB doesn't really extract the file, just gives you a stream (file-like-object) for reading it
- flo = archive.open(zipinfo, 'r')
+ flo = archive.open(zipinfo, "r")
data = flo.read(2**32)
flo.close()
if self.batch_size:
batch.append(data)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(data)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
-class KafkaJsonPusher(RecordPusher):
- def __init__(self, worker, kafka_hosts, consume_topic, group, **kwargs):
+class KafkaJsonPusher(RecordPusher):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ kafka_hosts: str,
+ consume_topic: str,
+ group: str,
+ **kwargs
+ ):
self.counts = Counter()
self.worker = worker
self.consumer = make_kafka_consumer(
@@ -481,29 +513,32 @@ class KafkaJsonPusher(RecordPusher):
consume_topic,
group,
)
- self.push_batches = kwargs.get('push_batches', False)
- self.raw_records = kwargs.get('raw_records', False)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.batch_size = kwargs.get('batch_size', 100)
+ self.push_batches = kwargs.get("push_batches", False)
+ self.raw_records = kwargs.get("raw_records", False)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.batch_size = kwargs.get("batch_size", 100)
if self.batch_size in (0, 1):
self.batch_size = 1
- self.batch_worker = kwargs.get('batch_worker', False)
- self.process_timeout_sec = kwargs.get('process_timeout_sec', 300)
+ self.batch_worker = kwargs.get("batch_worker", False)
+ self.process_timeout_sec = kwargs.get("process_timeout_sec", 300)
- def run(self):
+ def run(self) -> Counter:
while True:
# TODO: this is batch-oriented, because underlying worker is
# often batch-oriented, but this doesn't confirm that entire batch
- # has been pushed to fatcat before commiting offset. Eg, consider
+ # has been pushed to fatcat before committing offset. Eg, consider
            # case where there is one update and thousands of creates;
# update would be lingering in worker, and if worker crashed
# never created. Not great.
batch = self.consumer.consume(
- num_messages=self.batch_size,
- timeout=self.poll_interval)
- print("... got {} kafka messages ({}sec poll interval)".format(
- len(batch), self.poll_interval),
- file=sys.stderr)
+ num_messages=self.batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval)".format(
+ len(batch), self.poll_interval
+ ),
+ file=sys.stderr,
+ )
if not batch:
# TODO: could have some larger timeout here and
# self.worker.finish() if it's been more than, eg, a couple
@@ -515,14 +550,14 @@ class KafkaJsonPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
if self.push_batches:
- self.counts['total'] += len(batch)
- records = [json.loads(msg.value().decode('utf-8')) for msg in batch]
+ self.counts["total"] += len(batch)
+ records = [json.loads(msg.value().decode("utf-8")) for msg in batch]
self.worker.push_batch(records)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
else:
for msg in batch:
- self.counts['total'] += 1
+ self.counts["total"] += 1
if self.raw_records:
# In this mode, pass the Kafka message as bytes through
# without decoding as JSON. Eg, for thumbnails (where
@@ -530,7 +565,7 @@ class KafkaJsonPusher(RecordPusher):
# from the message)
record = msg.value()
else:
- record = json.loads(msg.value().decode('utf-8'))
+ record = json.loads(msg.value().decode("utf-8"))
# This complex bit of code implements backoff/backpressure
# in a way that will not cause this Kafka consumer to lose
# partition assignments (resulting in a rebalance). This
@@ -540,7 +575,9 @@ class KafkaJsonPusher(RecordPusher):
while not done:
try:
# use timeouts; don't want kafka itself to timeout
- self.worker.push_record_timeout(record, key=msg.key(), timeout=self.process_timeout_sec)
+ self.worker.push_record_timeout(
+ record, key=msg.key(), timeout=self.process_timeout_sec
+ )
break
except SandcrawlerBackoffError as be:
print("Backing off for 200 seconds: {}".format(be))
@@ -552,8 +589,8 @@ class KafkaJsonPusher(RecordPusher):
assert not empty_batch
time.sleep(5)
self.consumer.resume(self.consumer.assignment())
- self.counts['pushed'] += 1
- if self.counts['total'] % 500 == 0:
+ self.counts["pushed"] += 1
+ if self.counts["total"] % 500 == 0:
print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
for msg in batch:
# locally store offsets of processed messages; will be
@@ -562,16 +599,16 @@ class KafkaJsonPusher(RecordPusher):
# TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
# commit the current batch if it has been lingering
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
self.consumer.close()
return self.counts
-def make_kafka_consumer(hosts, consume_topic, group):
+def make_kafka_consumer(hosts: str, consume_topic: str, group: str) -> Consumer:
topic_name = consume_topic
- def fail_fast(err, partitions):
+ def fail_fast(err: Any, partitions: List[Any]) -> None:
if err is not None:
print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
print("Bailing out...", file=sys.stderr)
@@ -584,40 +621,41 @@ def make_kafka_consumer(hosts, consume_topic, group):
print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(p.error)
- #print("Kafka consumer commit successful")
+ # print("Kafka consumer commit successful")
pass
# previously, using pykafka
- #auto_commit_enable=True,
- #auto_commit_interval_ms=30000, # 30 seconds
+ # auto_commit_enable=True,
+ # auto_commit_interval_ms=30000, # 30 seconds
conf = {
- 'bootstrap.servers': hosts,
- 'group.id': group,
- 'on_commit': fail_fast,
+ "bootstrap.servers": hosts,
+ "group.id": group,
+ "on_commit": fail_fast,
# messages don't have offset marked as stored until processed,
# but we do auto-commit stored offsets to broker
- 'enable.auto.offset.store': False,
- 'enable.auto.commit': True,
+ "enable.auto.offset.store": False,
+ "enable.auto.commit": True,
# user code timeout; if no poll after this long, assume user code
# hung and rebalance (default: 6min)
- 'max.poll.interval.ms': 360000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
+ "max.poll.interval.ms": 360000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
},
}
- def on_rebalance(consumer, partitions):
+ def on_rebalance(consumer: Any, partitions: List[Any]) -> None:
for p in partitions:
if p.error:
raise KafkaException(p.error)
- print("Kafka partitions rebalanced: {} / {}".format(
- consumer, partitions),
- file=sys.stderr)
+ print(
+ "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), file=sys.stderr
+ )
consumer = Consumer(conf)
# NOTE: it's actually important that topic_name *not* be bytes (UTF-8
# encoded)
- consumer.subscribe([topic_name],
+ consumer.subscribe(
+ [topic_name],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
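# A minimal sketch (not from the patch) of the SIGALRM timeout pattern used by
# SandcrawlerWorker.push_record_timeout() above. POSIX-only, and only valid in
# the main thread; slow_task() here is a stand-in for worker.process().
import signal
import time


def slow_task() -> str:
    time.sleep(10)
    return "done"


def run_with_timeout(timeout_sec: int = 2) -> str:
    def handler(signum: int, frame) -> None:
        raise TimeoutError("timeout processing record")

    signal.signal(signal.SIGALRM, handler)
    signal.alarm(timeout_sec)
    try:
        return slow_task()
    except TimeoutError:
        return "timeout"
    finally:
        signal.alarm(0)  # always clear any pending alarm


print(run_with_timeout())  # prints "timeout" after ~2 seconds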
diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py
index 7a0086d..83d53d4 100644
--- a/python/sandcrawler/xml.py
+++ b/python/sandcrawler/xml.py
@@ -1,4 +1,3 @@
-
import xml.etree.ElementTree as ET
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 6be8bac..aebcbe1 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -1,26 +1,23 @@
#!/usr/bin/env python3
-
"""
These are generally for continuously running workers that consume from Kafka.
Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
or S3 (SeaweedFS).
"""
+import argparse
import os
+import subprocess
import sys
-import argparse
-import datetime
-import raven
-from sandcrawler import *
-from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker
+import sentry_sdk
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-try:
- git_sha = raven.fetch_git_sha('..')
-except Exception as e:
- git_sha = None
-sentry_client = raven.Client(release=git_sha)
+from sandcrawler import *
+from sandcrawler.persist import (
+ PersistCrossrefWorker,
+ PersistHtmlTeiXmlWorker,
+ PersistXmlDocWorker,
+)
def run_grobid_extract(args):
@@ -50,6 +47,7 @@ def run_grobid_extract(args):
)
pusher.run()
+
def run_pdf_extract(args):
consume_topic = "sandcrawler-{}.unextracted".format(args.env)
pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
@@ -80,6 +78,7 @@ def run_pdf_extract(args):
)
pusher.run()
+
def run_persist_grobid(args):
consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
worker = PersistGrobidWorker(
@@ -94,6 +93,8 @@ def run_persist_grobid(args):
kafka_group = "persist-grobid"
if args.s3_only:
kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
@@ -104,6 +105,7 @@ def run_persist_grobid(args):
)
pusher.run()
+
def run_persist_pdftext(args):
consume_topic = "sandcrawler-{}.pdf-text".format(args.env)
worker = PersistPdfTextWorker(
@@ -118,6 +120,8 @@ def run_persist_pdftext(args):
kafka_group = "persist-pdf-text"
if args.s3_only:
kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
@@ -128,6 +132,7 @@ def run_persist_pdftext(args):
)
pusher.run()
+
def run_persist_thumbnail(args):
consume_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
worker = PersistThumbnailWorker(
@@ -138,17 +143,21 @@ def run_persist_thumbnail(args):
s3_extension=".180px.jpg",
s3_folder="pdf",
)
+ kafka_group = "persist-pdf-thumbnail"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-pdf-thumbnail",
+ group=kafka_group,
push_batches=False,
raw_records=True,
batch_size=25,
)
pusher.run()
+
def run_persist_xml_doc(args: argparse.Namespace) -> None:
consume_topic = f"sandcrawler-{args.env}.xml-doc"
worker = PersistXmlDocWorker(
@@ -157,16 +166,20 @@ def run_persist_xml_doc(args: argparse.Namespace) -> None:
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
)
+ kafka_group = "persist-xml-doc"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-xml-doc",
+ group=kafka_group,
push_batches=False,
batch_size=25,
)
pusher.run()
+
def run_persist_html_teixml(args: argparse.Namespace) -> None:
consume_topic = f"sandcrawler-{args.env}.html-teixml"
worker = PersistHtmlTeiXmlWorker(
@@ -175,16 +188,20 @@ def run_persist_html_teixml(args: argparse.Namespace) -> None:
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
)
+ kafka_group = "persist-html-teixml"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-html-teixml",
+ group=kafka_group,
push_batches=False,
batch_size=25,
)
pusher.run()
+
def run_persist_pdftrio(args):
consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
worker = PersistPdfTrioWorker(
@@ -200,13 +217,20 @@ def run_persist_pdftrio(args):
)
pusher.run()
+
def run_ingest_file(args):
+ spn_cdx_retry_sec = 9.0
if args.bulk:
consume_group = "sandcrawler-{}-ingest-file-bulk".format(args.env)
consume_topic = "sandcrawler-{}.ingest-file-requests-bulk".format(args.env)
+ elif args.priority:
+ spn_cdx_retry_sec = 45.0
+ consume_group = "sandcrawler-{}-ingest-file-priority".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-priority".format(args.env)
else:
+ spn_cdx_retry_sec = 1.0
consume_group = "sandcrawler-{}-ingest-file".format(args.env)
- consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env)
produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
@@ -248,8 +272,9 @@ def run_ingest_file(args):
pdftext_sink=pdftext_sink,
xmldoc_sink=xmldoc_sink,
htmlteixml_sink=htmlteixml_sink,
- # don't SPNv2 for --bulk backfill
- try_spn2=not args.bulk,
+ # don't SPNv2 for --bulk or --skip-spn
+ try_spn2=not (args.bulk or args.skip_spn),
+ spn_cdx_retry_sec=spn_cdx_retry_sec,
)
pusher = KafkaJsonPusher(
worker=worker,
@@ -260,6 +285,7 @@ def run_ingest_file(args):
)
pusher.run()
+
def run_persist_ingest_file(args):
consume_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
worker = PersistIngestFileResultWorker(
@@ -275,96 +301,195 @@ def run_persist_ingest_file(args):
)
pusher.run()
+
+def run_persist_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ consume_topic = "fatcat-{}.api-crossref".format(args.env)
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-crossref",
+ push_batches=True,
+ # small batch size because doing GROBID processing
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('--grobid-host',
- default="http://grobid.qa.fatcat.wiki",
- help="GROBID API host/port")
- parser.add_argument('--db-url',
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "--kafka-group-suffix", default="", help="Kafka consumer group suffix (optional)"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ parser.add_argument(
+ "--db-url",
help="postgresql database connection string",
- default="postgres:///sandcrawler")
- parser.add_argument('--s3-url',
- help="S3 (seaweedfs) backend URL",
- default="localhost:9000")
- parser.add_argument('--s3-access-key',
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
- parser.add_argument('--s3-secret-key',
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
- parser.add_argument('--s3-bucket',
- help="S3 (seaweedfs) bucket to persist into",
- default="sandcrawler-dev")
+ default=os.environ.get("SANDCRAWLER_BLOB_SECRET_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
subparsers = parser.add_subparsers()
- sub_grobid_extract = subparsers.add_parser('grobid-extract',
- help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka")
+ sub_grobid_extract = subparsers.add_parser(
+ "grobid-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka",
+ )
sub_grobid_extract.set_defaults(func=run_grobid_extract)
- sub_pdf_extract = subparsers.add_parser('pdf-extract',
- help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka")
+ sub_pdf_extract = subparsers.add_parser(
+ "pdf-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka",
+ )
sub_pdf_extract.set_defaults(func=run_pdf_extract)
- sub_persist_grobid = subparsers.add_parser('persist-grobid',
- help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres")
- sub_persist_grobid.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_persist_grobid.add_argument('--db-only',
- action='store_true',
- help="only write status to database (don't upload TEI-XML to S3)")
+ sub_persist_grobid = subparsers.add_parser(
+ "persist-grobid",
+ help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
sub_persist_grobid.set_defaults(func=run_persist_grobid)
- sub_persist_pdftext = subparsers.add_parser('persist-pdftext',
- help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres")
- sub_persist_pdftext.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_persist_pdftext.add_argument('--db-only',
- action='store_true',
- help="only write status to database (don't upload TEI-XML to S3)")
+ sub_persist_pdftext = subparsers.add_parser(
+ "persist-pdftext",
+ help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
sub_persist_pdftext.set_defaults(func=run_persist_pdftext)
- sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail',
- help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres")
+ sub_persist_thumbnail = subparsers.add_parser(
+ "persist-thumbnail",
+ help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
- sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc',
- help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_xml_doc = subparsers.add_parser(
+ "persist-xml-doc",
+ help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc)
- sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml',
- help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_html_teixml = subparsers.add_parser(
+ "persist-html-teixml",
+ help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml)
- sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
- help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
+ sub_persist_pdftrio = subparsers.add_parser(
+ "persist-pdftrio",
+ help="daemon that consumes pdftrio output from Kafka and pushes to postgres",
+ )
sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
- sub_ingest_file = subparsers.add_parser('ingest-file',
- help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka")
- sub_ingest_file.add_argument('--bulk',
- action='store_true',
- help="consume from bulk kafka topic (eg, for ingest backfill)")
+ sub_ingest_file = subparsers.add_parser(
+ "ingest-file",
+ help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka",
+ )
+ sub_ingest_file.add_argument(
+ "--bulk",
+ action="store_true",
+ help="consume from bulk kafka topic (eg, for ingest backfill)",
+ )
+ sub_ingest_file.add_argument(
+ "--skip-spn",
+ action="store_true",
+ help="don't do SPN lookups",
+ )
+ sub_ingest_file.add_argument(
+ "--priority",
+ action="store_true",
+ help="consume from priority kafka topic (eg, for SPN requests)",
+ )
sub_ingest_file.set_defaults(func=run_ingest_file)
- sub_persist_ingest_file = subparsers.add_parser('persist-ingest-file',
- help="daemon that consumes ingest-file output from Kafka and pushes to postgres")
+ sub_persist_ingest_file = subparsers.add_parser(
+ "persist-ingest-file",
+ help="daemon that consumes ingest-file output from Kafka and pushes to postgres",
+ )
sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file)
+ sub_persist_crossref = subparsers.add_parser(
+ "persist-crossref",
+ help="daemon that persists crossref to postgres; also does GROBID ref transform",
+ )
+ sub_persist_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_persist_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+ sub_persist_crossref.set_defaults(func=run_persist_crossref)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
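+ # sentry_sdk.init() with no DSN argument reads SENTRY_DSN from the
+ # environment; if that variable is unset, error reporting is a no-op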
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 03a1f29..4561541 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is intended to be used for backfill ingest of old crawls. It can
also be used as a fast path for getting freshly crawled content into fatcat if
@@ -12,9 +11,9 @@ Run like:
Can then run through requests using that tool, or dump into kafka queue.
"""
-import sys
-import json
import argparse
+import json
+import sys
def run(args):
@@ -22,51 +21,54 @@ def run(args):
if not l.strip():
continue
row = json.loads(l)
- if not row['hit']:
+ if not row["hit"]:
continue
request = {
- 'base_url': row['final_url'],
- 'ingest_type': args.ingest_type,
- 'link_source': args.link_source,
- 'link_source_id': row['identifier'],
- 'ingest_request_source': args.ingest_request_source,
- 'ext_ids': {
- args.extid_type: row['identifier'],
+ "base_url": row["final_url"],
+ "ingest_type": args.ingest_type,
+ "link_source": args.link_source,
+ "link_source_id": row["identifier"],
+ "ingest_request_source": args.ingest_request_source,
+ "ext_ids": {
+ args.extid_type: row["identifier"],
},
}
if args.release_stage:
- assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update')
- request['release_stage'] = args.release_stage
+ assert args.release_stage in (
+ "published",
+ "submitted",
+ "accepted",
+ "draft",
+ "update",
+ )
+ request["release_stage"] = args.release_stage
print("{}".format(json.dumps(request, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--link-source',
- required=True,
- help="link_source to include in request")
- parser.add_argument('--extid-type',
- required=True,
- help="extid to encode identifier as")
- parser.add_argument('--ingest-type',
- default="pdf",
- help="ingest type (pdf, html, xml, etc)")
- parser.add_argument('--ingest-request-source',
- default="arabesque",
- help="to include in request")
- parser.add_argument('--release-stage',
- default=None,
- help="to include in request")
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--link-source", required=True, help="link_source to include in request"
+ )
+ parser.add_argument("--extid-type", required=True, help="extid to encode identifier as")
+ parser.add_argument(
+ "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)"
+ )
+ parser.add_argument(
+ "--ingest-request-source", default="arabesque", help="to include in request"
+ )
+ parser.add_argument("--release-stage", default=None, help="to include in request")
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
new file mode 100755
index 0000000..6328f52
--- /dev/null
+++ b/python/scripts/archiveorg_fileset.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+Helper script to generate fatcat fileset manifests from archive.org items.
+
+Takes either two args (archive.org item name and fatcat release ident), or a
+stream of tab-separated such pairs on stdin.
+
+TODO:
+- should this check the item type?
+"""
+
+import json
+import sys
+from typing import Any
+
+import internetarchive
+
+FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+}
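+# Formats not listed above currently raise a KeyError in parse_file() below
+# (see the TODO there about switching to a soft .get() lookup).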
+
+
+def want_file(f: Any, item_name: str) -> bool:
+ """
+ Filters IA API file entries: keep only "original" source files, skipping item-level metadata and derivative files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+
+def parse_file(f: Any) -> dict:
+ """
+ Takes an IA API file and turns it into a fatcat fileset manifest file
+ """
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = {
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
+ }
+ # TODO: will disable this hard check eventually and replace with:
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ mimetype = FORMAT_TO_MIMETYPE[f.format]
+ if mimetype:
+ mf["extra"] = dict(mimetype=mimetype)
+ return mf
+
+
+def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
+ print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
+ if release_id.startswith("release_"):
+ release_id = release_id[8:]  # strip the 8-character "release_" prefix
+ assert len(release_id) == 26
+ item = session.get_item(item_name)
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
+ fileset = {
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
+ }
+ print(json.dumps(fileset))
+ return fileset
+
+
+def main():
+ session = internetarchive.get_session()
+ if len(sys.argv) == 3:
+ item_name = sys.argv[1]
+ release_id = sys.argv[2]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+ else:
+ for line in sys.stdin:
+ line = line.strip()
+ if not line:
+ continue
+ fields = line.split("\t")
+ assert len(fields) == 2
+ item_name = fields[0]
+ release_id = fields[1]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..0b60da3
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+ ./cdx_collection SOME_COLLECTION_NAME
+"""
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import internetarchive as ia
+import requests
+
+
+def run():
+
+ if len(sys.argv) != 2:
+ print("Expected a single argument (collection name)")
+ sys.exit(-1)
+
+ collection = sys.argv[1]
+
+ # Check collection name is clean
+ assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum()
+
+ tempdir = tempfile.mkdtemp()
+ print("Looking up collection: {}".format(collection))
+
+ # First fetch list
+ item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
+
+ if len(item_list) == 0:
+ print("No items found, bailing")
+ sys.exit(-1)
+
+ print("Found {} potential items".format(len(item_list)))
+ status = True
+ errors = []
+ for item in item_list:
+ item = item["identifier"]
+ # TODO: error handling
+ try:
+ ret = ia.download(
+ item,
+ files=[item + ".cdx.gz"],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000,
+ )
+ status = ret and status
+ except requests.exceptions.ReadTimeout as rt:
+ print(str(rt), file=sys.stderr)
+ errors.append(rt)
+ continue
+
+ if errors:
+ print("## Download Errors", file=sys.stderr)
+ for e in errors:
+ print(e, file=sys.stderr)
+
+ # Combine files
+ print("Merging and re-compressing all CDX files...")
+ # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True)
+
+ # Move and cleanup
+ shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection))
+
+ print("Done!")
+
+
+if __name__ == "__main__":
+ run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 33c425d..e3bf4f0 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python3
-
"""
Transform COVID-19 metadata (CNKI and Wanfang scrape JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
@@ -18,38 +18,44 @@ def canon(s):
def transform_cnki(obj):
requests = []
- assert obj['cnki_id']
-
+ assert obj["cnki_id"]
requests = []
- requests.append({
- 'base_url': canon(obj['info_url']),
- 'ingest_type': 'pdf',
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
- if 'read_url' in obj:
- requests.append({
- 'base_url': canon(obj['read_url']),
- 'ingest_type': 'pdf', # actually HTML
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
+ requests.append(
+ {
+ "base_url": canon(obj["info_url"]),
+ "ingest_type": "pdf",
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+ if "read_url" in obj:
+ requests.append(
+ {
+ "base_url": canon(obj["read_url"]),
+ "ingest_type": "pdf", # actually HTML
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
return requests
+
def transform_wanfang(obj):
- assert obj['wanfang_id']
- return [{
- 'base_url': canon(obj['url']),
- 'ingest_type': 'pdf',
- 'link_source': 'wanfang_covid19',
- 'link_source_id': obj['wanfang_id'],
- 'ingest_request_source': 'scrape-covid19',
- }]
+ assert obj["wanfang_id"]
+ return [
+ {
+ "base_url": canon(obj["url"]),
+ "ingest_type": "pdf",
+ "link_source": "wanfang_covid19",
+ "link_source_id": obj["wanfang_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ ]
def run(args):
@@ -58,26 +64,27 @@ def run(args):
continue
row = json.loads(l)
- if 'wanfang_id' in row:
+ if "wanfang_id" in row:
requests = transform_wanfang(row) or []
- elif 'cnki_id' in row:
+ elif "cnki_id" in row:
requests = transform_cnki(row) or []
else:
continue
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="COVID-19 metadata file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..27ccf21 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -19,23 +19,20 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+import sentry_sdk
def b32_hex(s):
@@ -45,81 +42,80 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-class DeliverDumpGrobidS3():
+class DeliverDumpGrobidS3:
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
- self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
- self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "grobid/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml")
+ self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def run(self, dump_file):
sys.stderr.write("Starting...\n")
for line in dump_file:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
if len(sha1_hex) != 40:
sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
- tei_xml = grobid.get('tei_xml')
+ tei_xml = grobid.get("tei_xml")
if not tei_xml:
print("{}\tskip empty".format(sha1_hex))
- self.count['skip-empty'] += 1
+ self.count["skip-empty"] += 1
continue
- tei_xml = tei_xml.encode('utf-8')
+ tei_xml = tei_xml.encode("utf-8")
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
Body=tei_xml,
StorageClass=self.s3_storage_class,
)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="grobid/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".tei.xml",
- help='file suffix for created objects')
- parser.add_argument('--s3-storage-class',
- type=str,
- default="STANDARD",
- help='AWS S3 storage class (redundancy) to use')
- parser.add_argument('dump_file',
- help="TSV/JSON dump file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix",
+ type=str,
+ default="grobid/",
+ help="key prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--s3-storage-class",
+ type=str,
+ default="STANDARD",
+ help="AWS S3 storage class (redundancy) to use",
+ )
+ parser.add_argument(
+ "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r")
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index 3dcf962..093f32a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -7,160 +7,191 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbDisk:
-
def __init__(self, disk_dir, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.disk_dir = disk_dir
- self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
- self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+ self.disk_prefix = kwargs.get("disk_prefix", "pdf/")
+ self.disk_suffix = kwargs.get("disk_suffix", ".pdf")
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Ensuring all 65536 base directories exist...\n")
for i in range(256):
for j in range(256):
- fpath = "{}/{}{:02x}/{:02x}".format(
- self.disk_dir,
- self.disk_prefix,
- i,
- j)
+ fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# save to disk
fpath = "{}/{}{}/{}/{}{}".format(
- self.disk_dir,
- self.disk_prefix,
- sha1_hex[0:2],
- sha1_hex[2:4],
- sha1_hex,
- self.disk_suffix)
- with open(fpath, 'wb') as f:
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix,
+ )
+ with open(fpath, "wb") as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
- self.count['success-disk'] += 1
+ self.count["success-disk"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--disk-dir',
- required=True,
- type=str,
- help='local base directory to save into')
- parser.add_argument('--disk-prefix',
- type=str,
- default="pdf/",
- help='directory prefix for items created in bucket')
- parser.add_argument('--disk-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created files')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--disk-dir", required=True, type=str, help="local base directory to save into"
+ )
+ parser.add_argument(
+ "--disk-prefix",
+ type=str,
+ default="pdf/",
+ help="directory prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--disk-suffix", type=str, default=".pdf", help="file suffix for created files"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 39ac000..6f37ede 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -24,7 +24,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
- wayback/GWB libraries
"""
@@ -33,152 +33,180 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import boto3
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbS3:
-
def __init__(self, s3_bucket, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
- self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "pdf/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".pdf")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
- Body=blob)
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
+ Body=blob,
+ )
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="pdf/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created objects')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket"
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index b981ab6..aef5c12 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform a DOAJ article dump (JSON) into ingest requests.
@@ -9,31 +8,31 @@ in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""
-import sys
-import json
import argparse
+import json
+import sys
+from typing import List, Optional
+
import urlcanon
-from typing import Optional, List
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- #"semanticscholar.org/",
+ # "semanticscholar.org/",
"://doi.org/",
+ "://dx.doi.org/",
"zenodo.org/",
"figshare.com/",
"://archive.org/",
".archive.org/",
-
# large publishers/platforms; may remove in the future
- #"://link.springer.com/",
- #"://dergipark.gov.tr/",
- #"frontiersin.org/",
- #"scielo",
+ # "://link.springer.com/",
+ # "://dergipark.gov.tr/",
+ # "frontiersin.org/",
+ # "scielo",
]
# these default to PDF; note that we also do pdf ingests for HTML pages
@@ -41,78 +40,83 @@ CONTENT_TYPE_MAP = {
"abstract": [],
"doc": [],
"": ["pdf"],
-
"doi": ["pdf"],
"url": ["pdf"],
"fulltext": ["pdf"],
"anySimpleType": ["pdf"],
-
"application/pdf": ["pdf"],
"html": ["html", "pdf"],
"text/html": ["html", "pdf"],
"xml": ["xml"],
}
+
def canon(s: str) -> str:
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj: dict) -> List[dict]:
"""
Transforms from a single DOAJ object to zero or more ingest requests.
Returns a list of dicts.
"""
- doaj_id = obj['id'].lower()
+ doaj_id = obj["id"].lower()
assert doaj_id
- bibjson = obj['bibjson']
- if not bibjson['link']:
+ bibjson = obj["bibjson"]
+ if not bibjson["link"]:
return []
requests = []
doi: Optional[str] = None
- for ident in (bibjson['identifier'] or []):
- if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
- doi = ident['id'].lower()
+ for ident in bibjson["identifier"] or []:
+ if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+ doi = ident["id"].lower()
- for link in (bibjson['link'] or []):
- if link.get('type') != "fulltext" or not link.get('url'):
+ for link in bibjson["link"] or []:
+ if link.get("type") != "fulltext" or not link.get("url"):
continue
- ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
+ ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
if not ingest_types:
continue
+
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in link['url'].lower():
+ if domain in link["url"].lower():
skip = True
if skip:
continue
try:
- base_url = canon(link['url'])
+ base_url = canon(link["url"].strip())
except UnicodeEncodeError:
continue
+ if not base_url or len(base_url) > 1000:
+ continue
+
for ingest_type in ingest_types:
request = {
- 'base_url': base_url,
- 'ingest_type': ingest_type,
- 'link_source': 'doaj',
- 'link_source_id': doaj_id,
- 'ingest_request_source': 'doaj',
- 'release_stage': 'published',
- 'rel': 'publisher',
- 'ext_ids': {
- 'doi': doi,
- 'doaj': doaj_id,
+ "base_url": base_url,
+ "ingest_type": ingest_type,
+ "link_source": "doaj",
+ "link_source_id": doaj_id,
+ "ingest_request_source": "doaj",
+ "release_stage": "published",
+ "rel": "publisher",
+ "ext_ids": {
+ "doi": doi,
+ "doaj": doaj_id,
},
- 'edit_extra': {},
+ "edit_extra": {},
}
requests.append(request)
return requests
+
def run(args) -> None:
for l in args.json_file:
if not l.strip():
@@ -123,17 +127,18 @@ def run(args) -> None:
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main() -> None:
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="DOAJ article dump file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 9fe1499..44c091c 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
And outputs JSON objects that can be imported into fatcat with the
No dependencies (only python3 stdlib)
"""
-import sys
-import json
import base64
+import json
+import sys
+
def run():
for line in sys.stdin:
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 5
- raw_sha1 = line[0].replace('sha1:', '')
+ raw_sha1 = line[0].replace("sha1:", "")
dois = json.loads(line[1])
cdx = json.loads(line[2])
mimetype = line[3]
size = int(line[4])
- sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+ sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()
obj = dict(
sha1=sha1,
dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+ cdx=[dict(url=cdx["url"], dt=cdx["dt"])],
size=size,
- mimetype=mimetype)
+ mimetype=mimetype,
+ )
print(json.dumps(obj))
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/fetch_cdx_sha1hex.py b/python/scripts/fetch_cdx_sha1hex.py
new file mode 100755
index 0000000..2eb56cb
--- /dev/null
+++ b/python/scripts/fetch_cdx_sha1hex.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+"""
+This is a helper script to take fatcat file entities with partial metadata (eg,
+missing SHA256) and try to find one or more CDX record where the file may be
+found in wayback.
+
+This script uses the sandcrawler library and should be run like:
+
+ head file_export.json | python -m scripts.fetch_cdx_sha1hex > results.json
+"""
+
+import base64
+import json
+import sys
+from typing import List, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+from sandcrawler.ia import CdxApiClient, cdx_to_dict
+
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 3,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: requests.Session = None,
+) -> requests.Session:
+ """
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+ """
+ session = session or requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+ return session
+
+
+def b32_hex(s: str) -> str:
+ """
+ Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+ base32 checksums are used by, eg, heritrix and in wayback CDX files
+ """
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ if len(s) == 40:
+ return s
+ raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
+
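+# PostgREST endpoint in front of the sandcrawler SQL database; get_db_cdx()
+# below queries its /cdx route by sha1hex.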
+SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
+
+
+def get_db_cdx(sha1hex: str, http_session) -> List[dict]:
+ resp = http_session.get(
+ SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(sha1hex="eq." + sha1hex)
+ )
+ resp.raise_for_status()
+ rows = resp.json()
+ return rows or []
+
+
+CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
+
+
+def get_api_cdx(url: str, sha1hex: str, cdx_api) -> Optional[dict]:
+
+ params = {
+ "url": url,
+ "output": "json",
+ "matchType": "exact",
+ "limit": 20,
+ # TODO: group-by digest/checksum?
+ # can't filter status because might be warc/revisit
+ # "filter": "statuscode:200",
+ }
+ rows = cdx_api._query_api(params)
+ if not rows:
+ return None
+ for row in rows:
+ if row.sha1hex == sha1hex:
+ return row
+ return None
+
+
+def process_file(fe, session, cdx_api) -> dict:
+ status = "unknown"
+
+ # simple CDX db lookup first
+ cdx_row_list = get_db_cdx(fe["sha1"], http_session=session)
+ if cdx_row_list:
+ return dict(
+ file_entity=fe,
+ cdx_rows=cdx_row_list,
+ status="success-db",
+ )
+
+ original_urls = []
+ for pair in fe["urls"]:
+ u = pair["url"]
+ if not "://web.archive.org/web/" in u:
+ continue
+ seg = u.split("/")
+ assert seg[2] == "web.archive.org"
+ assert seg[3] == "web"
+ if not seg[4].isdigit():
+ continue
+ original_url = "/".join(seg[5:])
+ original_urls.append(original_url)
+
+ if len(original_urls) == 0:
+ return dict(file_entity=fe, status="skip-no-urls")
+
+ found_cdx_rows = []
+ for url in list(set(original_urls)):
+
+ cdx_record = None
+ try:
+ cdx_record = get_api_cdx(url, sha1hex=fe["sha1"], cdx_api=cdx_api)
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 403:
+ return dict(file_entity=fe, status="fail-cdx-403")
+ else:
+ raise
+ if cdx_record and cdx_record.sha1hex == fe["sha1"]:
+ found_cdx_rows.append(cdx_to_dict(cdx_record))
+
+ if found_cdx_rows:
+ return dict(
+ file_entity=fe,
+ cdx_rows=found_cdx_rows,
+ status="success-api",
+ )
+
+ return dict(
+ file_entity=fe,
+ status="fail-not-found",
+ )
+
+
+def main():
+ session = requests_retry_session()
+ session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
+ }
+ )
+ cdx_api = CdxApiClient()
+ for line in sys.stdin:
+ if not line.strip():
+ continue
+ fe = json.loads(line)
+ print(json.dumps(process_file(fe, session=session, cdx_api=cdx_api)))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index dc4bea7..8fce0d9 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,44 +1,49 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
-with open('title_slug_denylist.txt', 'r') as f:
+with open("title_slug_denylist.txt", "r") as f:
TITLE_DENYLIST = [l.strip() for l in f]
-TITLE_DENYLIST.extend((
- 'editorial',
- 'advertisement',
- 'bookreviews',
- 'reviews',
- 'nr',
- 'abstractoriginalarticle',
- 'originalarticle',
- 'impactfactor',
- 'articlenumber',
-))
+TITLE_DENYLIST.extend(
+ (
+ "editorial",
+ "advertisement",
+ "bookreviews",
+ "reviews",
+ "nr",
+ "abstractoriginalarticle",
+ "originalarticle",
+ "impactfactor",
+ "articlenumber",
+ )
+)
# The full name can't *entirely* be one of these
NAME_DENYLIST = (
- 'phd',
- 'phdstudent',
+ "phd",
+ "phdstudent",
)
+
def tokenize(s, remove_whitespace=True):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+ return s.encode("ascii", "replace").decode("utf8").replace("?", "")
+
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
title = title.strip()
@@ -47,14 +52,14 @@ def filter_title(title):
title_slug = tokenize(title, remove_whitespace=True)
if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
- if title_slug.startswith('nr'):
+ if title_slug.startswith("nr"):
return None
- if title.lower().replace('.', '').startswith('int j '):
+ if title.lower().replace(".", "").startswith("int j "):
return None
for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
if title.startswith(prefix):
- title.replace(prefix, '')
+ title.replace(prefix, "")
if title.startswith("The Journal of "):
return None
@@ -78,63 +83,84 @@ def filter_title(title):
return None
# too deep subtitling/splitting
- if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:
return None
return title
+
def filter_author_name(name):
- name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
+ name = name["name"]
+ if name.strip().lower().replace(" ", "") in NAME_DENYLIST:
return None
- return ' '.join([t for t in name.split() if tokenize(t)])
+ return " ".join([t for t in name.split() if tokenize(t)])
+
def filter_authors(l):
return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
def filter_refs(l):
# TODO:
return l
+
def filter_journal_name(name):
# same denylist, for now
if not name:
return None
- name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")
slug_name = tokenize(name)
if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ for prefix in (
+ "/ ",
+ "~ ",
+ "& ",
+ "© ",
+ "Original Research Article ",
+ "Original Article ",
+ "Research Article ",
+ "Available online www.jocpr.com ",
+ ):
if name.startswith(prefix):
- name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ name = name.replace(prefix, "")
+ for suffix in (
+ " Available online at www.sciarena.com",
+ " Original Article",
+ " Available online at",
+ " ISSN",
+ " ISSUE",
+ ):
if name.endswith(suffix):
- name = name.replace(suffix, '')
+ name = name.replace(suffix, "")
if "====================" in name:
return None
if len(name) > 150:
return None
- return ' '.join(name.split())
+ return " ".join(name.split())
+
def filter_metadata(obj):
- if not (obj.get('title') and obj.get('authors')):
+ if not (obj.get("title") and obj.get("authors")):
return None
- title = filter_title(obj['title'])
+ title = filter_title(obj["title"])
if not title:
- #sys.stderr.write("bad title\n")
+ # sys.stderr.write("bad title\n")
return None
else:
- obj['title'] = title
- obj['authors'] = filter_authors(obj['authors'])
- obj['citations'] = filter_refs(obj['citations'])
- obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+ obj["title"] = title
+ obj["authors"] = filter_authors(obj["authors"])
+ obj["citations"] = filter_refs(obj["citations"])
+ obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])
return obj
+
def run(invert=False):
for line in sys.stdin:
- fields = line.split('\t')
+ fields = line.split("\t")
if len(fields) == 5:
raw = fields[4]
elif len(fields) == 1:
@@ -151,9 +177,10 @@ def run(invert=False):
fields[4] = processed
else:
fields[0] = processed
- print('\t'.join(fields))
+ print("\t".join(fields))
elif invert:
print(raw.strip())
-if __name__=="__main__":
+
+if __name__ == "__main__":
run(invert="--invert" in sys.argv)
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
@@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50
REQUIRE_AUTHORS = False
+
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -51,7 +53,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -59,20 +61,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, left, right)
def process_group(rows):
@@ -86,10 +90,10 @@ def process_group(rows):
left = json.loads(row[1])
right = json.loads(row[2])
# authors must roughly match
- if not check_authors(left['authors'], right['authors']):
+ if not check_authors(left["authors"], right["authors"]):
continue
# years must match (if defined)
- if left['year'] and right['year'] and left['year'] != right['year']:
+ if left["year"] and right["year"] and left["year"] != right["year"]:
continue
filtered.append((left, right))
@@ -101,8 +105,8 @@ def process_group(rows):
group_ids = set()
for row in filtered[1:]:
(left, right) = row
- l_id = left['fatcat_release']
- r_id = right['fatcat_release']
+ l_id = left["fatcat_release"]
+ r_id = right["fatcat_release"]
releases[l_id] = left
releases[r_id] = right
if not group_ids:
@@ -119,6 +123,7 @@ def process_group(rows):
print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
last_slug = None
@@ -126,7 +131,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -140,5 +145,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3654b87..c5b7eef 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
-import sys
import json
+import sys
# out of 1000
score_threshold = 900
@@ -23,15 +23,16 @@ require_authors = 1
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -44,7 +45,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -52,20 +53,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert not check_authors([], [])
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, grobid, crossref)
def process_group(rows):
@@ -78,20 +81,21 @@ def process_group(rows):
continue
grobid = json.loads(row[1])
crossref = json.loads(row[2])
- if not check_authors(crossref['authors'], grobid['authors']):
- #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+ if not check_authors(crossref["authors"], grobid["authors"]):
+ # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
continue
else:
- #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+ # print("YES: {} {}".format(crossref['authors'], grobid['authors']))
pass
- sha1 = grobid['sha1']
- doi = crossref['doi'].lower()
+ sha1 = grobid["sha1"]
+ doi = crossref["doi"].lower()
l = keepers.get(sha1, list())
l.append(doi)
keepers[sha1] = l
for sha1, doi_list in keepers.items():
print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
def run():
last_slug = None
@@ -99,7 +103,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -112,5 +116,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
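
The author check in filter_scored_matches.py works on tokenized byte strings: each left-hand contributor's last name must appear as a substring of the tokenized right-hand author list, which is what the test cases above encode. A small standalone illustration, with tokenize() copied from the script and invented names:

    def tokenize(s, remove_whitespace=False):
        # copied from filter_scored_matches.py above
        s.replace("&apos;", "'")
        # Remove non-alphanumeric characters
        s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
        if remove_whitespace:
            s = "".join(s.split())
        # Encode as dumb ASCII
        return s.encode("ascii", "replace").replace(b"?", b"")

    # "mago" matches "Mr. Magoo" because the comparison is a substring test
    assert tokenize("mago") in tokenize(" ".join(["Mr. Magoo"]))
    # a last name absent from the right-hand side fails the check
    assert tokenize("three") not in tokenize(" ".join(["one", "two"]))
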
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..90a0f77 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -10,43 +9,49 @@ Run in bulk like:
ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
-import sys
import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
-from grobid2json import teixml2json
def parse_hbase(line):
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 2
sha1hex = line[0]
obj = json.loads(line[1])
- tei_xml = obj['tei_xml']
+ tei_xml = obj["tei_xml"]
return sha1hex, tei_xml
+
def parse_pg(line):
obj = json.loads(line)
- return obj['sha1hex'], obj['tei_xml']
+ return obj["sha1hex"], obj["tei_xml"]
+
-def run(mode='hbase'):
+def run(mode="hbase"):
for line in sys.stdin:
- if mode == 'hbase':
+ if mode == "hbase":
sha1hex, tei_xml = parse_hbase(line)
- elif mode == 'pg':
+ elif mode == "pg":
sha1hex, tei_xml = parse_pg(line)
else:
- raise NotImplementedError('parse mode: {}'.format(mode))
+ raise NotImplementedError("parse mode: {}".format(mode))
- obj = teixml2json(tei_xml, encumbered=False)
+ tei_doc = parse_document_xml(tei_xml)
+ tei_doc.remove_encumbered()
+ obj = tei_doc.to_legacy_dict()
affiliations = []
- for author in obj['authors']:
- if author.get('affiliation'):
- affiliations.append(author['affiliation'])
+ for author in obj["authors"]:
+ if author.get("affiliation"):
+ affiliations.append(author["affiliation"])
if affiliations:
# don't duplicate affiliations; only the unique ones
affiliations = list(set([json.dumps(a) for a in affiliations]))
affiliations = [json.loads(a) for a in affiliations]
- print('\t'.join([sha1hex, json.dumps(affiliations)]))
+ print("\t".join([sha1hex, json.dumps(affiliations)]))
+
-if __name__=='__main__':
+if __name__ == "__main__":
run()
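
The change above swaps the old grobid2json.teixml2json() helper for the grobid_tei_xml library; parse_document_xml(), remove_encumbered(), and to_legacy_dict() are the calls introduced by this commit. A hedged sketch of that extraction path as a standalone function (the tei_xml argument is assumed to hold one GROBID TEI document):

    import json

    from grobid_tei_xml import parse_document_xml

    def affiliations_from_tei(sha1hex, tei_xml):
        # parse TEI-XML, strip copyright-encumbered fields, and use the
        # legacy (grobid2json-style) dict layout, as the updated script does
        tei_doc = parse_document_xml(tei_xml)
        tei_doc.remove_encumbered()
        obj = tei_doc.to_legacy_dict()
        affiliations = [a["affiliation"] for a in obj["authors"] if a.get("affiliation")]
        # de-duplicate via a JSON round-trip, keeping only unique affiliations
        unique = set(json.dumps(a) for a in affiliations)
        affiliations = [json.loads(a) for a in unique]
        return "\t".join([sha1hex, json.dumps(affiliations)])
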
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index 3d2e14c..f941881 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,69 +1,67 @@
#!/usr/bin/env python3
-import sys
-import json
import datetime
+import json
+import sys
+
+MAX_ABSTRACT_BYTES = 4096
-MAX_ABSTRACT_BYTES=4096
def parse_grobid_json(obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra = dict()
- if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(
- mimetype="text/plain",
- language=None,
- content=obj.get('abstract').strip())
+ if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for a in obj.get('authors', []):
+ for a in obj.get("authors", []):
c = dict(raw_name=a, role="author")
contribs.append(c)
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
- if raw.get('title'):
- ref['title'] = raw['title'].strip()
- if raw.get('date'):
+ ref["key"] = raw.get("id")
+ if raw.get("title"):
+ ref["title"] = raw["title"].strip()
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
- ref['year'] = year
+ year = int(raw["date"].strip()[:4])
+ ref["year"] = year
except:
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
extra[key] = raw[key].strip()
- if raw.get('authors'):
- extra['authors'] = [a['name'] for a in raw['authors']]
+ if raw.get("authors"):
+ extra["authors"] = [a["name"] for a in raw["authors"]]
if extra:
extra = dict(grobid=extra)
else:
extra = None
- ref['extra'] = extra
+ ref["extra"] = extra
refs.append(ref)
release_type = "journal-article"
release_date = None
- if obj.get('date'):
+ if obj.get("date"):
# TODO: only returns year, ever? how to handle?
- release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+ release_date = datetime.datetime(year=obj["date"], month=1, day=1)
- if obj.get('doi'):
- extra['doi'] = obj['doi']
- if obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"].lower()
+ if obj["journal"].get("name"):
+ extra["container_name"] = obj["journal"]["name"]
- extra['is_longtail_oa'] = True
+ extra["is_longtail_oa"] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -73,15 +71,17 @@ def parse_grobid_json(obj):
extra = None
return dict(
- title=obj['title'].strip(),
+ title=obj["title"].strip(),
contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=obj["journal"].get("publisher"),
+ volume=obj["journal"].get("volume"),
+ issue=obj["journal"].get("issue"),
abstracts=abstracts,
release_type=release_type,
release_date=release_date,
- extra=extra)
+ extra=extra,
+ )
+
def run():
for line in sys.stdin:
@@ -90,5 +90,6 @@ def run():
if out:
print(out)
-if __name__=="__main__":
+
+if __name__ == "__main__":
run()
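
For context, a sketch of the kind of single-line GROBID metadata object parse_grobid_json() above consumes; only the key names come from the code, the values are invented:

    import json

    sample = {
        "title": "An Example Longtail OA Paper",
        "authors": ["Jane Doe", "John Roe"],
        "journal": {"name": "Example Journal", "publisher": "Example Press",
                    "volume": "12", "issue": "3"},
        "date": 2019,
        "doi": "10.1234/EXAMPLE",
        "abstract": "A short abstract, well under MAX_ABSTRACT_BYTES.",
        "citations": [],
    }
    print(json.dumps(sample))
    # parse_grobid_json() maps this to a release dict: title, one contrib per
    # author (raw_name/role), publisher/volume/issue from "journal", abstracts
    # from "abstract", release_type="journal-article",
    # release_date=datetime.datetime(2019, 1, 1), plus extra metadata that
    # includes the lower-cased DOI (new in this commit), container_name, and
    # is_longtail_oa=True.
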
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 494ec7a..8a353ca 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is used to turn ingest request postgres rows (in JSON export
format) back in to regular ingest request JSON.
@@ -7,24 +6,25 @@ format) back in to regular ingest request JSON.
The only difference is the name and location of some optional keys.
"""
-import sys
-import json
import argparse
+import json
+import sys
def transform(row):
"""
dict-to-dict
"""
- row.pop('created', None)
- extra = row.pop('request', None) or {}
- for k in ('ext_ids', 'edit_extra'):
+ row.pop("created", None)
+ extra = row.pop("request", None) or {}
+ for k in ("ext_ids", "edit_extra"):
if k in extra:
row[k] = extra[k]
- if 'release_ident' in extra:
- row['fatcat'] = dict(release_ident=extra['release_ident'])
+ if "release_ident" in extra:
+ row["fatcat"] = dict(release_ident=extra["release_ident"])
return row
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -33,19 +33,27 @@ def run(args):
req = transform(json.loads(l))
except:
print(l, file=sys.stderr)
+ if args.force_recrawl:
+ req["force_recrawl"] = True
print(json.dumps(req, sort_keys=True))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
+ )
+ parser.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="whether to add recrawl (SPNv2) flag to request",
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
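
The transform() above is small enough to exercise directly; the sketch below copies it and applies it to a hypothetical exported row, showing how the "request" blob is flattened back into a regular ingest request (and where the new --force-recrawl flag would add its field):

    import json

    def transform(row):
        # copied from ingestrequest_row2json.py above
        row.pop("created", None)
        extra = row.pop("request", None) or {}
        for k in ("ext_ids", "edit_extra"):
            if k in extra:
                row[k] = extra[k]
        if "release_ident" in extra:
            row["fatcat"] = dict(release_ident=extra["release_ident"])
        return row

    row = {
        "base_url": "https://repo.example.edu/files/paper.pdf",
        "ingest_type": "pdf",
        "created": "2021-01-01T00:00:00Z",
        "request": {
            "ext_ids": {"doi": "10.1234/example"},
            "release_ident": "examplereleaseident000000",
        },
    }
    print(json.dumps(transform(row), sort_keys=True))
    # "created" is dropped, "ext_ids" is lifted to the top level, and
    # "fatcat": {"release_ident": ...} is added; with --force-recrawl, run()
    # would also set "force_recrawl": true before printing.
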
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..24e22fd 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
to JSON format for fast fatcat importing.
"""
-import sys
import json
import sqlite3
+import sys
# iterate over rows in files metadata...
# 1. select all identified DOIs
@@ -20,6 +20,7 @@ import sqlite3
# 2. select all file metadata
# 3. output object
+
def or_none(s):
if s is None:
return None
@@ -27,6 +28,7 @@ def or_none(s):
return None
return s
+
def process_db(db_path):
db = sqlite3.connect(db_path)
@@ -52,5 +54,6 @@ def process_db(db_path):
dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
print(json.dumps(obj))
-if __name__=="__main__":
+
+if __name__ == "__main__":
process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,19 +1,18 @@
#!/usr/bin/env python3
-
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
-
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
# OAI specific additions
"://hdl.handle.net/",
]
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+ "oai:hypotheses.org:%",
+]
+
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
- 'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
- 'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -50,38 +80,43 @@ def transform(obj):
"""
requests = []
- if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
return []
- if not obj.get('urls'):
+ if not obj.get("urls"):
return []
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
# look in obj['formats'] for PDF?
- if obj.get('formats'):
+ if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
# skip. Note that we will continue if there is no formats list.
has_pdf = False
- for f in obj['formats']:
- if 'pdf' in f.lower():
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
has_pdf = True
if not has_pdf:
return []
doi = None
- if obj.get('doi'):
- doi = obj['doi'][0].lower().strip()
- if not doi.startswith('10.'):
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
doi = None
# infer release stage and/or type from obj['types']
release_stage = None
- for t in obj.get('types', []):
+ for t in obj.get("types", []):
if t in RELEASE_STAGE_MAP:
release_stage = RELEASE_STAGE_MAP[t]
# TODO: infer rel somehow? Eg, repository vs. OJS publisher
rel = None
- for url in obj['urls']:
+ for url in obj["urls"]:
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in url:
@@ -94,23 +129,25 @@ def transform(obj):
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'oai',
- 'link_source_id': obj['oai'].lower(),
- 'ingest_request_source': 'metha-bulk',
- 'release_stage': release_stage,
- 'rel': rel,
- 'ext_ids': {
- 'doi': doi,
- 'oai': obj['oai'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": oai_id,
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "oai": obj["oai"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
+ if doi:
+ request["ext_ids"]["doi"] = doi
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -121,17 +158,20 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file",
help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
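
To make the mapping above concrete, here is a hypothetical OAI-PMH dump record and the request shape transform() would emit for it; the values are invented, the key names and constants come from the code:

    record = {
        "oai": "oai:repo.example.edu:12345",
        "urls": ["https://repo.example.edu/bitstream/12345/paper.pdf"],
        "formats": ["application/pdf"],
        "doi": ["10.1234/example"],
        "types": ["info:eu-repo/semantics/publishedVersion"],
    }
    # transform(record) would yield one request shaped like:
    # {
    #   "base_url": "<canonicalized first URL>",
    #   "ingest_type": "pdf",
    #   "link_source": "oai",
    #   "link_source_id": "oai:repo.example.edu:12345",
    #   "ingest_request_source": "metha-bulk",
    #   "release_stage": "published",
    #   "rel": None,
    #   "ext_ids": {"oai": "oai:repo.example.edu:12345", "doi": "10.1234/example"},
    #   "edit_extra": {},
    # }
    # Records whose identifier starts with an OAI_BLOCKLIST prefix, or whose
    # URLs all match DOMAIN_BLOCKLIST, produce no requests at all.
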
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index af08db6..8b57c5b 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc).
@@ -7,6 +6,7 @@ Originally used to benchmark and compare file size/quality.
"""
import sys
+
import poppler
from PIL import Image
@@ -22,13 +22,16 @@ def run(inpath, outpath):
renderer = poppler.PageRenderer()
full_page = renderer.render_page(page)
- img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1)
- img.thumbnail((180,300), Image.BICUBIC)
- #img.thumbnail((360,600), Image.BICUBIC)
+ img = Image.frombuffer(
+ "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
+ )
+ img.thumbnail((180, 300), Image.BICUBIC)
+ # img.thumbnail((360,600), Image.BICUBIC)
img.save(outpath)
- #img.save(outpath, quality=95)
+ # img.save(outpath, quality=95)
+
-if __name__ == '__main__':
+if __name__ == "__main__":
if len(sys.argv) != 3:
print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
sys.exit(-1)
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 5536e6c..cb64a1a 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -1,41 +1,39 @@
#!/usr/bin/env python3
-
"""
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- "semanticscholar.org/",
"://doi.org/",
"zenodo.org/",
"figshare.com/",
- "://archive.org/",
- ".archive.org/",
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
- 'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
- 'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ "draftVersion": "draft",
+ "submittedVersion": "submitted",
+ "acceptedVersion": "accepted",
+ "publishedVersion": "published",
+ "updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single unpaywall object to zero or more ingest requests.
@@ -43,48 +41,49 @@ def transform(obj):
"""
requests = []
- if not obj['doi'].startswith('10.'):
+ if not obj["doi"].startswith("10."):
return requests
- if not obj['oa_locations']:
+ if not obj["oa_locations"]:
return requests
- for location in obj['oa_locations']:
- if not location['url_for_pdf']:
+ for location in obj["oa_locations"]:
+ if not location["url_for_pdf"]:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in location['url_for_pdf']:
+ if domain in location["url_for_pdf"]:
skip = True
if skip:
continue
try:
- base_url = canon(location['url_for_pdf'])
+ base_url = canon(location["url_for_pdf"])
except UnicodeEncodeError:
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'unpaywall',
- 'link_source_id': obj['doi'].lower(),
- 'ingest_request_source': 'unpaywall',
- 'release_stage': RELEASE_STAGE_MAP.get(location['version']),
- 'rel': location['host_type'],
- 'ext_ids': {
- 'doi': obj['doi'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "unpaywall",
+ "link_source_id": obj["doi"].lower(),
+ "ingest_request_source": "unpaywall",
+ "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+ "rel": location["host_type"],
+ "ext_ids": {
+ "doi": obj["doi"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
- if obj.get('oa_status'):
- request['edit_extra']['oa_status'] = obj['oa_status']
- if location.get('evidence'):
- request['edit_extra']['evidence'] = location['evidence']
- if location['pmh_id']:
- request['ext_ids']['pmh_id'] = location['pmh_id']
+ if obj.get("oa_status"):
+ request["edit_extra"]["oa_status"] = obj["oa_status"]
+ if location.get("evidence"):
+ request["edit_extra"]["evidence"] = location["evidence"]
+ if location["pmh_id"]:
+ request["ext_ids"]["pmh_id"] = location["pmh_id"]
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -95,17 +94,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
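
Likewise for the unpaywall script: a hypothetical record with one usable oa_location, and the request transform() above would construct from it (key names from the code, values invented):

    record = {
        "doi": "10.1234/EXAMPLE",
        "oa_status": "green",
        "oa_locations": [
            {
                "url_for_pdf": "https://repo.example.edu/files/paper.pdf",
                "version": "acceptedVersion",
                "host_type": "repository",
                "evidence": "oa repository (via OAI-PMH doi match)",
                "pmh_id": "oai:repo.example.edu:12345",
            }
        ],
    }
    # transform(record) would yield one request shaped like:
    # {
    #   "base_url": "<canonicalized url_for_pdf>",
    #   "ingest_type": "pdf",
    #   "link_source": "unpaywall",
    #   "link_source_id": "10.1234/example",
    #   "ingest_request_source": "unpaywall",
    #   "release_stage": "accepted",
    #   "rel": "repository",
    #   "ext_ids": {"doi": "10.1234/example", "pmh_id": "oai:repo.example.edu:12345"},
    #   "edit_extra": {"oa_status": "green",
    #                  "evidence": "oa repository (via OAI-PMH doi match)"},
    # }
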
diff --git a/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
new file mode 100644
index 0000000..54d07db
--- /dev/null
+++ b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T22:08:45Z","timestamp":1620684525878},"publisher-location":"Cham","reference-count":28,"publisher":"Springer International Publishing","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-64953-1_4","type":"book-chapter","created":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T02:57:20Z","timestamp":1610593040000},"page":"53-71","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mathematical Knowledge and Mathematical Objects"],"prefix":"10.1007","author":[{"given":"Lars-G\u00f6ran","family":"Johansson","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,1,14]]},"reference":[{"key":"4_CR12","doi-asserted-by":"publisher","volume-title":"Deflating existential consequence: A case for nominalism","author":"J Azzouni","year":"2004","unstructured":"Azzouni, J. (2004). Deflating existential consequence: A case for nominalism. New York: Oxford University Press.","DOI":"10.1093\/0195159888.001.0001"},{"key":"4_CR23","doi-asserted-by":"publisher","volume-title":"Foundations of constructive mathematics","author":"M Beeson","year":"1985","unstructured":"Beeson, M. (1985). Foundations of constructive mathematics. Berlin\/Heidelberg: Springer.","DOI":"10.1007\/978-3-642-68952-9"},{"issue":"2","key":"4_CR27","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1093\/philmat\/11.2.176","volume":"11","author":"H Billinge","year":"2003","unstructured":"Billinge, H. (2003). Did bishop have a philosophy of mathematics? Philosophica Mathematica, 11(2), 176\u2013194.","journal-title":"Philosophica Mathematica"},{"key":"4_CR29","doi-asserted-by":"publisher","volume-title":"Constructive analysis","author":"E Bishop","year":"1985","unstructured":"Bishop, E., & Bridges, D. S. (1985). Constructive analysis. Berlin: Springer.","DOI":"10.1007\/978-3-642-61667-9"},{"key":"4_CR37","series-title":"In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.)","volume-title":"Nominalism in the philosophy of mathematics","author":"O Bueno","year":"2014","unstructured":"Bueno, O. (2014). Nominalism in the philosophy of mathematics. In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.). Metaphysics Research Lab, Stanford University."},{"key":"4_CR38","volume-title":"Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen","author":"G Cantor","year":"1883","unstructured":"Cantor, G. (1883). Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen. Leipzig: Teubner."},{"key":"4_CR60","volume-title":"The seas of language","author":"M Dummett","year":"1993","unstructured":"Dummett, M. (1993). The seas of language. 
Oxford: Clarendon Press."},{"key":"4_CR73","volume-title":"In the light of logic","author":"S Feferman","year":"1998","unstructured":"Feferman, S. (1998). In the light of logic. New York: Oxford University Press."},{"key":"4_CR74","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1093\/0195148770.003.0019","volume-title":"The Oxford handbook of philosophy of mathematics and logic","author":"S Feferman","year":"2005","unstructured":"Feferman, S. (2005). Predicativity. In S. Shapiro (Ed.), The Oxford handbook of philosophy of mathematics and logic (pp. 590\u2013624). New York\/Oxford: Oxford University Press."},{"key":"4_CR77","volume-title":"Science without numbers: A defence of nominalism","author":"H H Field","year":"1980","unstructured":"Field, H. H. (1980). Science without numbers: A defence of nominalism. Oxford: Blackwell."},{"key":"4_CR88","volume-title":"Werke, volume 8","author":"C F Gauss","year":"2011","unstructured":"Gauss, C. F. (2011). Werke, volume 8. Cambridge: Cambridge University Press."},{"key":"4_CR93","unstructured":"Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155\u2013172). Bobs-Merrill company."},{"key":"4_CR103","volume-title":"Mathematics without numbers: Towards a modal-structural interpretation","author":"G Hellman","year":"1989","unstructured":"Hellman, G. (1989). Mathematics without numbers: Towards a modal-structural interpretation. Oxford: Clarendon Press."},{"key":"4_CR126","first-page":"201","volume-title":"Bertrand Russell. Philosopher of the century","author":"G Kreisel","year":"1967","unstructured":"Kreisel, G. (1967). Mathematical logic: What has it done for the philosophy of mathematics? In R. Shoenman (Ed.), Bertrand Russell. Philosopher of the century (pp. 201\u2013272). London: George Allen & Unwin."},{"key":"4_CR135","doi-asserted-by":"crossref","unstructured":"Lear, J. (1980). Aristotelian infinity. Proceedings of the Aristotelian Society, New Series, 80, 187\u2013210.","DOI":"10.1093\/aristotelian\/80.1.187"},{"key":"4_CR175","doi-asserted-by":"publisher","first-page":"63","DOI":"10.12775\/LLP.1998.004","volume":"6","author":"F Pataut","year":"1998","unstructured":"Pataut, F. (1998). Incompleteness, constructivism and truth. Logic and Logical Philosophy, 6, 63\u201376.","journal-title":"Logic and Logical Philosophy"},{"key":"4_CR180","first-page":"294","volume":"14","author":"H Poincar\u00e9","year":"1906","unstructured":"Poincar\u00e9, H. (1906). Les math\u00e9matiques et la logique. Revue de m\u00e9taphysique et de morale, 14, 294\u2013317.","journal-title":"Revue de m\u00e9taphysique et de morale"},{"key":"4_CR190","volume-title":"Word and object","author":"W V O Quine","year":"1960","unstructured":"Quine, W. V. O. (1960). Word and object. Cambridge, MA: MIT Press."},{"key":"4_CR193","unstructured":"Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133\u2013136). Cambridge, MA: Harvard University Press."},{"key":"4_CR197","first-page":"31","volume-title":"Theories and things","author":"W V O Quine","year":"1981","unstructured":"Quine, W. V. O. (1981c). What price bivalence? In Theories and things (pp. 31\u201337). Cambridge, MA: The Belknap Press of Harvard University Press."},{"issue":"1","key":"4_CR198","doi-asserted-by":"publisher","first-page":"5","DOI":"10.2307\/2026889","volume":"89","author":"WV O Quine","year":"1992","unstructured":"Quine, W.V. O. (1992). Structure and nature. 
The Journal of Philosophy, 89(1), 5\u20139.","journal-title":"The Journal of Philosophy"},{"key":"4_CR199","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1080\/014453401625669","volume":"25","author":"P Raatikainen","year":"2004","unstructured":"Raatikainen, P. (2004). Conceptions of truth in intuitionism. History and Philosophy of Logic, 25, 131\u2013145.","journal-title":"History and Philosophy of Logic"},{"key":"4_CR210","unstructured":"Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29\u201353."},{"key":"4_CR212","volume-title":"Introduction to mathematical philosophy","author":"B Russell","year":"1919","unstructured":"Russell, B. (1919). Introduction to mathematical philosophy. London: Routledge."},{"key":"4_CR222","doi-asserted-by":"crossref","unstructured":"Schwarz, J. T. (2006(1966)). The pernicious influence of mathematics on science. In R. Hersch (Ed.), 18 unconventional essays on the nature of mathematics (Chap. 13, pp. 231\u2013235). New York: Springer.","DOI":"10.1007\/0-387-29831-2_13"},{"key":"4_CR233","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/BF00247187","volume":"12","author":"G Sundholm","year":"1983","unstructured":"Sundholm, G. (1983). Constructions, proofs and the meaning of logical constants. Journal of Philosophical Logic, 12, 151\u2013172.","journal-title":"Journal of Philosophical Logic"},{"issue":"2","key":"4_CR235","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1007\/s10701-007-9186-9","volume":"38","author":"M Tegmark","year":"2008","unstructured":"Tegmark, M. (2008). The mathematical universe. Foundations of Physics, 38(2), 101\u2013150.","journal-title":"Foundations of Physics"},{"key":"4_CR262","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1016\/0010-0277(90)90003-3","volume":"36","author":"K Wynn","year":"1990","unstructured":"Wynn, K. (1990). Children\u2019s understanding of counting. Cognition, 36, 155\u2013193.","journal-title":"Cognition"}],"container-title":["Synthese Library","Empiricism and Philosophy of Physics"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-64953-1_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T03:00:39Z","timestamp":1610593239000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":28,"URL":"http:\/\/dx.doi.org\/10.1007\/978-3-030-64953-1_4","relation":{},"ISSN":["0166-6991","2542-8292"],"issn-type":[{"value":"0166-6991","type":"print"},{"value":"2542-8292","type":"electronic"}],"published":{"date-parts":[[2021]]},"assertion":[{"value":"14 January 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}} \ No newline at end of file
diff --git a/python/tests/files/crossref_api_work_s1047951103000064.json b/python/tests/files/crossref_api_work_s1047951103000064.json
new file mode 100644
index 0000000..dfb795d
--- /dev/null
+++ b/python/tests/files/crossref_api_work_s1047951103000064.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,6,10]],"date-time":"2021-06-10T05:35:02Z","timestamp":1623303302043},"reference-count":46,"publisher":"Cambridge University Press (CUP)","issue":"1","license":[{"start":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T00:00:00Z","timestamp":1113782400000},"content-version":"unspecified","delay-in-days":807,"URL":"https:\/\/www.cambridge.org\/core\/terms"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cardiol Young"],"published-print":{"date-parts":[[2003,2]]},"abstract":"<jats:p>We designed a multi-hospital prospective study of children less than 12 years to determine the comparative clinical profile, severity of carditis, and outcome on follow up of patients suffering an initial and recurrent episodes of acute rheumatic fever. The study extended over a period of 3 years, with diagnosis based on the Jones criteria. We included 161 children in the study, 57 having only one episode and 104 with recurrent episodes. Those seen in the first episode were differentiated from those with recurrent episodes on the basis of the history. The severity of carditis was graded by clinical and echocardiographic means. In those suffering their first episode, carditis was significantly less frequent (61.4%) compared to those having recurrent episodes (96.2%). Arthritis was more marked in the first episode (61.4%) compared to recurrent episodes (36.5%). Chorea was also significantly higher in the first episode (15.8%) compared to recurrent episodes (3.8%). Sub-cutaneous nodules were more-or-less the same in those suffering the first (7%) as opposed to recurrent episodes (5.8%), but Erythema marginatum was more marked during the first episode (3.5%), being rare in recurrent episodes at 0.9%. Fever was recorded in approximately the same numbers in first (45.6%) and recurrent episodes (48.1%). Arthralgia, in contrast, was less frequent in first (21.1%) compared to recurrent episodes (32.7%). A history of sore throat was significantly increased amongst those suffering the first episode (54.4%) compared to recurrent episodes (21.2%). When we compared the severity of carditis in the first versus recurrent episodes, at the start of study mild carditis was found in 29.8% versus 10.6%, moderate carditis in 26.3% versus 53.8%, and severe carditis in 5.3% versus 31.8% of cases, respectively. At the end of study, 30.3% of patients suffering their first episode were completely cured of carditis, and all others showed significant improvement compared to those with recurrent episodes, where only 6.8% were cured, little improvement or deterioration being noted in the remainder of the patients. 
We conclude that the clinical profile of acute rheumatic fever, especially that of carditis, is milder in those suffering their first attack compared to those with recurrent episodes.<\/jats:p>","DOI":"10.1017\/s1047951103000064","type":"journal-article","created":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T11:49:54Z","timestamp":1113824994000},"page":"28-35","source":"Crossref","is-referenced-by-count":11,"title":["Clinical profile of acute rheumatic fever in Pakistan"],"prefix":"10.1017","volume":"13","author":[{"given":"Hasina Suleman","family":"Chagani","sequence":"first","affiliation":[]},{"given":"Kalimuddin","family":"Aziz","sequence":"additional","affiliation":[]}],"member":"56","published-online":{"date-parts":[[2005,4,18]]},"reference":[{"key":"S1047951103000064_ref010","doi-asserted-by":"crossref","unstructured":"Alan L , Bisno . Group A streptococcal infection and acute rheumatic fever. N Engl J Med 1991; 325: 783\u2013793.","DOI":"10.1056\/NEJM199109123251106"},{"key":"S1047951103000064_ref036","doi-asserted-by":"crossref","unstructured":"Abbasi AS , Hashmi JA , Robinson RD , Suraya S , Syed SA . Prevalence of heart disease in school children of Karachi. Am J Cardiol 1966; 18: 544\u2013547.","DOI":"10.1016\/0002-9149(66)90008-7"},{"key":"S1047951103000064_ref025","unstructured":"Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285\u2013294."},{"key":"S1047951103000064_ref013","unstructured":"Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185\u2013192."},{"key":"S1047951103000064_ref007","doi-asserted-by":"crossref","unstructured":"Okoroma EO , Ihenacho HNC , Anyanwu CH . Rheumatic fever in Nigerian children. A prospective study of 66 patients. Am J Dis Child 1981; 35: 236\u2013238.","DOI":"10.1001\/archpedi.1981.02130270028010"},{"key":"S1047951103000064_ref031","doi-asserted-by":"crossref","unstructured":"Gordis L . Effectiveness of comprehensive care program in preventing rheumatic fever. N Engl J Med 1973; 289: 331\u2013335.","DOI":"10.1056\/NEJM197308162890701"},{"key":"S1047951103000064_ref012","unstructured":"Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21\u201324."},{"key":"S1047951103000064_ref026","doi-asserted-by":"crossref","unstructured":"Reale A , Colella C , Bruno AM . Mitral stenosis in childhood: Clinical and therapeutic aspects. Am Heart J 1963; 66: 15.","DOI":"10.1016\/0002-8703(63)90064-4"},{"key":"S1047951103000064_ref046","doi-asserted-by":"crossref","unstructured":"Aziz KU , Cheema L , Memon AD . Long-term observations of rheumatic carditis. Cardiol Young 1992; 2: 254\u2013260.","DOI":"10.1017\/S1047951100001001"},{"key":"S1047951103000064_ref041","unstructured":"Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300\u2013305."},{"key":"S1047951103000064_ref002","unstructured":"Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889."},{"key":"S1047951103000064_ref043","unstructured":"Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336\u2013345."},{"key":"S1047951103000064_ref037","unstructured":"Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. 
Pakistan Heart Journal 1981; 14: 2\u20136."},{"key":"S1047951103000064_ref029","doi-asserted-by":"crossref","unstructured":"Hassel TA , Stuart KL . Rheumatic fever prophylaxis. A three-year study. Br Med J 1972; 2: 39\u201340.","DOI":"10.1136\/bmj.2.5909.39"},{"key":"S1047951103000064_ref024","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Berry AM , Duggal S , Hooja V , Ghosh S . Sequel of initial attack of acute rheumatic fever. A prospective 5-year follow-up study. Circulation 1982; 65: 375\u2013379.","DOI":"10.1161\/01.CIR.65.2.375"},{"key":"S1047951103000064_ref022","doi-asserted-by":"crossref","unstructured":"Brownell KD , Rese FB . Acute rheumatic fever in children. Incidence in Borough of New York city. JAMA. 1973; 224: 1593\u20131597.","DOI":"10.1001\/jama.1973.03220260015004"},{"key":"S1047951103000064_ref035","unstructured":"Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071\u20131081."},{"key":"S1047951103000064_ref003","unstructured":"El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980's. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183\u2013203."},{"key":"S1047951103000064_ref045","doi-asserted-by":"crossref","unstructured":"Markowitz M . Eradication of rheumatic fever. An unfulfilled hope. Circulation 1970; 41: 1077\u20131084.","DOI":"10.1161\/01.CIR.41.6.1077"},{"key":"S1047951103000064_ref005","unstructured":"Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886."},{"key":"S1047951103000064_ref017","unstructured":"Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483\u2013494."},{"key":"S1047951103000064_ref028","doi-asserted-by":"crossref","unstructured":"Ehmke DA , Stehbens JA , Young L . Two studies of compliance with daily prophylaxis in rheumatic fever patients in Iowa. Am J Public Health 1980; 70: 1189\u20131193.","DOI":"10.2105\/AJPH.70.11.1189"},{"key":"S1047951103000064_ref021","doi-asserted-by":"crossref","unstructured":"Ward C . The reappraisal of the clinical features in acute and chronic rheumatic heart disease. Etiology implications. Am Heart J 1979; 98: 298\u2013306.","DOI":"10.1016\/0002-8703(79)90040-1"},{"key":"S1047951103000064_ref009","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Thaper MK , Ahmed SA , Hooja V , Tewari P . The initial attack of acute rheumatic fever during childhood in North India. A prospective study of the clinical profile. Circulation 1974; 49: 7\u201312.","DOI":"10.1161\/01.CIR.49.1.7"},{"key":"S1047951103000064_ref016","unstructured":"Strasser T . Rheumatic fever and rheumatic heart disease in the 1970's. WHO Chron. 1978; 32: 18\u201325."},{"key":"S1047951103000064_ref019","doi-asserted-by":"crossref","unstructured":"Bland EF , Jones TD . Rheumatic fever and rheumatic heart disease. A twenty-year report on 1000 patients followed since childhood. Circulation 1951; 4: 836\u2013843.","DOI":"10.1161\/01.CIR.4.6.836"},{"key":"S1047951103000064_ref042","doi-asserted-by":"crossref","unstructured":"Wood HF , McCarty M . Laboratory aids in the diagnosis of rheumatic fever and evaluation of disease activity. Am J Med 1954; 17: 768\u2013774.","DOI":"10.1016\/0002-9343(54)90221-1"},{"key":"S1047951103000064_ref020","doi-asserted-by":"crossref","unstructured":"Baldwin JS , Kerr JM , Kuttner AG , Doyle EF . Observation in rheumatic nodules over 30 years period. 
J Pediatr 1960; 56: 465\u2013470.","DOI":"10.1016\/S0022-3476(60)80358-7"},{"key":"S1047951103000064_ref004","doi-asserted-by":"crossref","unstructured":"Majeed HA , Khan N , Dabbagh M , Naidi K . Acute rheumatic fever during childhood in Kuwait: The mild nature of initial attack. Ann Trop Paediatr 1981; 1: 13\u201320.","DOI":"10.1080\/02724936.1981.11748053"},{"key":"S1047951103000064_ref001","unstructured":"Brittanica: Book of year 1991. Chicago, 1991."},{"key":"S1047951103000064_ref039","unstructured":"Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990."},{"key":"S1047951103000064_ref040","doi-asserted-by":"crossref","unstructured":"Taranta A , Markowitz M . Rheumatic fever. A guide to its recognition, prevention and cure, with special reference to developing countries. M.T.P. Press Ltd., Boston, 1981.","DOI":"10.1007\/978-94-015-7171-5"},{"key":"S1047951103000064_ref032","unstructured":"Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1\u201315."},{"key":"S1047951103000064_ref014","unstructured":"Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2\u20139."},{"key":"S1047951103000064_ref011","doi-asserted-by":"crossref","unstructured":"Gharib R . Acute rheumatic fever in Shiraz, Iran. It's prevalence and characteristics in two socio-economic groups. Am J Dis Child 1969: 118: 694\u2013699.","DOI":"10.1001\/archpedi.1969.02100040696005"},{"key":"S1047951103000064_ref008","unstructured":"Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543\u2013550."},{"key":"S1047951103000064_ref033","doi-asserted-by":"crossref","unstructured":"Spagnuolo M , Pasternack B , Taranta A . Risk of rheumatic fever recurrences after streptococcal infections. Prospective study of clinical and social factors. N Engl J Med 1971; 285: 641\u2013647.","DOI":"10.1056\/NEJM197109162851201"},{"key":"S1047951103000064_ref038","unstructured":"Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539\u2013549."},{"key":"S1047951103000064_ref023","doi-asserted-by":"crossref","unstructured":"Feinstein AR , Spagnuolo M . The clinical patterns of acute rheumatic fever; A reappraisal. Medicine 1962; 41: 279\u2013305.","DOI":"10.1097\/00005792-196212000-00001"},{"key":"S1047951103000064_ref018","unstructured":"Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501\u20131515."},{"key":"S1047951103000064_ref027","unstructured":"Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8\u201314."},{"key":"S1047951103000064_ref034","unstructured":"Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14\u201316."},{"key":"S1047951103000064_ref044","unstructured":"Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389\u2013395."},{"key":"S1047951103000064_ref006","unstructured":"Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. 
Indian pediatr 1983; 20: 849\u2013853."},{"key":"S1047951103000064_ref030","unstructured":"Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599\u2013603."},{"key":"S1047951103000064_ref015","doi-asserted-by":"crossref","unstructured":"Robinson RD , Sultana S , Abbasi AS et al. Acute rheumatic fever in Karachi, Pakistan. Am J Cardiol 1966; 8: 548\u2013551.","DOI":"10.1016\/0002-9149(66)90009-9"}],"container-title":["Cardiology in the Young"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.cambridge.org\/core\/services\/aop-cambridge-core\/content\/view\/S1047951103000064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,4,6]],"date-time":"2020-04-06T22:32:57Z","timestamp":1586212377000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2003,2]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2003,2]]}},"alternative-id":["S1047951103000064"],"URL":"http:\/\/dx.doi.org\/10.1017\/s1047951103000064","relation":{},"ISSN":["1047-9511","1467-1107"],"issn-type":[{"value":"1047-9511","type":"print"},{"value":"1467-1107","type":"electronic"}],"subject":["Cardiology and Cardiovascular Medicine","General Medicine","Pediatrics, Perinatology, and Child Health"],"published":{"date-parts":[[2003,2]]}}} \ No newline at end of file
diff --git a/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
new file mode 100644
index 0000000..b47f85b
--- /dev/null
+++ b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
@@ -0,0 +1,66 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">A world of individuals</title>
+ <author>
+ <persName><forename type="first">N</forename><surname>Goodman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Problems and projects</title>
+ <imprint>
+ <date type="published" when="1972">1972</date>
+ <biblScope unit="page" from="155" to="172" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Implicit definition sustained</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">V O</forename><surname>Quine</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The ways of paradox and other essays</title>
+ <meeting><address><addrLine>Cambridge, MA</addrLine></address></meeting>
+ <imprint>
+ <publisher>Harvard University Press</publisher>
+ <date type="published" when="1976">1976b</date>
+ <biblScope unit="page" from="133" to="136" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">On some difficulties in the theory of transfinite numbers and order types</title>
+ <author>
+ <persName><forename type="first">B</forename><surname>Russell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1906">1906</date>
+ <publisher>Proceedings of London Mathematical Society</publisher>
+ <biblScope unit="volume">4</biblScope>
+ <biblScope unit="page" from="29" to="53" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/grobid_refs_s1047951103000064.tei.xml b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
new file mode 100644
index 0000000..e0eae8a
--- /dev/null
+++ b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
@@ -0,0 +1,499 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">The community control of rheumatic fever and rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">N</forename><surname>Dondong</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Elkholy</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="285" to="294" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note>Report of a WHO international co-operative project</note>
+ <note type="raw_reference">Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285–294.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Rehman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="page" from="185" to="192" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185–192.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever in Sudanese children</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Ismail</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>El Amin</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Arab J Med</title>
+ <imprint>
+ <biblScope unit="volume">2</biblScope>
+ <biblScope unit="page" from="21" to="24" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21–24.</note>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+ <analytic>
+ <title level="a" type="main">Incidence of heart disease in children at NICVD</title>
+ <author>
+ <persName><forename type="first">K</forename><forename type="middle">U</forename><surname>Aziz</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="300" to="305" />
+ <date type="published" when="1984">1984</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300–305.</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <monogr>
+ <title level="m" type="main">The various manifestations of rheumatic fever as exemplified in childhood and early life</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">B</forename><surname>Cheadle</surname></persName>
+ </author>
+ <imprint>
+ <publisher>Smith and Co</publisher>
+ <biblScope unit="page">1889</biblScope>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889.</note>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-I. A major public health problem</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="336" to="345" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336–345.</note>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+ <analytic>
+ <title level="a" type="main">Prevalence of heart disease in school children of Islamabad</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Malik</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Jaffrey</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Ahmed</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">Zubeda</forename><surname>Khanum</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pakistan Heart Journal</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <biblScope unit="page" from="2" to="6" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. Pakistan Heart Journal 1981; 14: 2–6.</note>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease and overcrowding</title>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Watkins</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Quinn</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Am J Public Health</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="page" from="1071" to="1081" />
+ <date type="published" when="1948">1948</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071–1081.</note>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+ <analytic>
+ <title level="a" type="main">The spectrum and specter of rheumatic fever in 1980&apos;s</title>
+ <author>
+ <persName><forename type="first">W</forename><surname>El-Sadr</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Taranta</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Clinical Immunology Up-Date. Edited by Franklin EC</title>
+ <imprint>
+ <biblScope unit="page" from="183" to="203" />
+ <date type="published" when="1979">1979</date>
+ <publisher>Elsevier</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980&apos;s. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183–203.</note>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+ <monogr>
+ <title level="m" type="main">Tonsillitis in adolescent, Bailliere Tendoll and Cox</title>
+ <author>
+ <persName><forename type="first">C</forename><surname>Haig-Brown</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1886">1886</date>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886.</note>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+ <analytic>
+ <title level="a" type="main">Studies on the transmission within the families of group A hemolytic streptococci</title>
+ <author>
+ <persName><forename type="first">L</forename><forename type="middle">I</forename><surname>Levine</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Chapman</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Guerra</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><surname>Cooper</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Krause</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">J Lab Clin Med</title>
+ <imprint>
+ <biblScope unit="volume">67</biblScope>
+ <biblScope unit="page" from="483" to="494" />
+ <date type="published" when="1966">1966</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483–494.</note>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <monogr>
+ <title level="m" type="main">Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="volume">32</biblScope>
+ <biblScope unit="page" from="18" to="25" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Strasser T . Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron. 1978; 32: 18–25.</note>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <monogr>
+ <title level="m" type="main">Brittanica: Book of year 1991</title>
+ <imprint>
+ <date type="published" when="1991">1991</date>
+ <publisher>Chicago</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Brittanica: Book of year 1991. Chicago, 1991.</note>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+ <monogr>
+ <title level="m" type="main">Pockets of rheumatic fever in developed world. XI World Congress of Cardiology</title>
+ <author>
+ <persName><forename type="first">R</forename><surname>Talbot</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1990">1990</date>
+ <pubPlace>Manila</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990.</note>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+ <analytic>
+ <title level="a" type="main">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease</title>
+ </analytic>
+ <monogr>
+ <title level="j">Circulation</title>
+ <imprint>
+ <biblScope unit="volume">41</biblScope>
+ <biblScope unit="page" from="A1" to="15" />
+ <date type="published" when="1970">1970</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1–15.</note>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever and rheumatic carditis in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Shafqat</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Ramzan</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="page" from="2" to="9" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2–9.</note>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in developing countries</title>
+ <author>
+ <persName><forename type="first">S</forename><surname>Padmavati</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">56</biblScope>
+ <biblScope unit="page" from="543" to="550" />
+ <date type="published" when="1979">1979</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543–550.</note>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+ <analytic>
+ <title level="a" type="main">Streptococcal infections in families. Factors altering individual susceptibility</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Meyer</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Haggerty</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pediatrics</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="539" to="549" />
+ <date type="published" when="1962">1962</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539–549.</note>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+ <analytic>
+ <title level="a" type="main">Collagen and connective tissue diseases</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Shanks</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Textbook of Pediatrics</title>
+ <editor>
+ <persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Forfar</surname></persName>
+ <persName><forename type="first">C</forename><forename type="middle">C</forename><surname>Arneil</surname></persName>
+ </editor>
+ <meeting><address><addrLine>Edinburgh</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="page" from="1501" to="1515" />
+ </imprint>
+ <respStmt>
+ <orgName>Churchill Livingstone</orgName>
+ </respStmt>
+ </monogr>
+ <note type="raw_reference">Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501–1515.</note>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+ <analytic>
+ <title level="a" type="main">Prophylaxis against recurrence of rheumatic fever</title>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Billoo</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">S</forename><surname>Abbasi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Sultana</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">L</forename><surname>Desa</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">1</biblScope>
+ <biblScope unit="page" from="8" to="14" />
+ <date type="published" when="1968">1968</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8–14.</note>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">5</biblScope>
+ <biblScope unit="page" from="14" to="16" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14–16.</note>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="389" to="395" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389–395.</note>
+</biblStruct>
+
+<biblStruct xml:id="b22">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever: Clinical profile of 339 cases with long term follow-up</title>
+ <author>
+ <persName><forename type="first">M</forename><forename type="middle">K</forename><surname>Joshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">P</forename><forename type="middle">W</forename><surname>Kandoth</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Barve</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Kamat</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Indian pediatr</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="849" to="853" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. Indian pediatr 1983; 20: 849–853.</note>
+</biblStruct>
+
+<biblStruct xml:id="b23">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in rural south Indian children</title>
+ <author>
+ <persName><forename type="first">G</forename><surname>Koshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Benjamin</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">G</forename><surname>Cherian</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="599" to="603" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599–603.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index 3f84ea4..3839c99 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -27,21 +27,16 @@
"date": "2001",
"id": "b0",
"index": 0,
- "issue": null,
"journal": "Letters in the Alphabet",
- "publisher": null,
+ "pages": "1-11",
"title": "Everything is Wonderful",
- "url": null,
"volume": "20"},
{ "authors": [],
"date": "2011-03-28",
"id": "b1",
"index": 1,
- "issue": null,
"journal": "The Dictionary",
- "publisher": null,
"title": "All about Facts",
- "url": null,
"volume": "14"}
],
"abstract": "Everything you ever wanted to know about nothing",
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 36d90ef..dce64bc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,17 +1,18 @@
+import json
+import struct
import pytest
-import struct
import responses
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from test_wayback import wayback_client, cdx_client
-
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
-with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
+with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f:
REAL_TEI_XML = f.read()
+
@pytest.fixture
def grobid_client():
client = GrobidClient(
@@ -19,61 +20,203 @@ def grobid_client():
)
return client
+
@responses.activate
def test_grobid_503(grobid_client):
status = b'{"status": "done broke due to 503"}'
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=503,
- body=status)
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=503,
+ body=status,
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POST 1x times
assert len(responses.calls) == 1
- assert resp['status_code'] == 503
- assert resp['status'] == "error"
+ assert resp["status_code"] == 503
+ assert resp["status"] == "error"
+
+
+@responses.activate
+def test_grobid_success_iso_8859(grobid_client):
+ """
+    This might have been the old GROBID behavior, with a default ISO-8859-1
+    encoding when the response declares no charset.
+ """
+
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ # print(type(resp['tei_xml']))
+ # print(type(REAL_TEI_XML))
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1")
+
@responses.activate
def test_grobid_success(grobid_client):
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="application/xml; charset=UTF-8",
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POST 1x times
assert len(responses.calls) == 1
- assert resp['status_code'] == 200
- assert resp['status'] == "success"
- #print(type(resp['tei_xml']))
- #print(type(REAL_TEI_XML))
- assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1')
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8")
+
@responses.activate
-def test_grobid_worker_cdx(grobid_client, wayback_client):
+def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
+ assert len(responses.calls) == worker.counts["total"]
+
+
+@responses.activate
+def test_grobid_refs_978(grobid_client):
+
+ with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f:
+ xml_bytes = f.read()
+ assert "\u2013".encode("utf-8") in xml_bytes
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=xml_bytes,
+ content_type="application/xml; charset=UTF-8",
+ )
- assert len(responses.calls) == worker.counts['total']
+ refs_row = grobid_client.crossref_refs(crossref_work)
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 3
+ assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"])
+
+ # test case of no references
+ crossref_work["message"]["reference"] = []
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # test that 'message' works also
+ refs_row = grobid_client.crossref_refs(crossref_work["message"])
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # grobid gets no additional POST from the above empty queries
+ assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_grobid_refs_s104(grobid_client):
+
+ # test another file
+ with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f:
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=f.read(),
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ # GROBID gets one more POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1017/s1047951103000064"
+ assert refs_row["source_ts"] == "2021-06-10T05:35:02Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 24
+ assert set([r["id"] for r in refs]) == set(
+ [
+ "S1047951103000064_ref025",
+ "S1047951103000064_ref013",
+ "S1047951103000064_ref012",
+ "S1047951103000064_ref041",
+ "S1047951103000064_ref002",
+ "S1047951103000064_ref043",
+ "S1047951103000064_ref037",
+ "S1047951103000064_ref035",
+ "S1047951103000064_ref003",
+ "S1047951103000064_ref005",
+ "S1047951103000064_ref017",
+ "S1047951103000064_ref016",
+ "S1047951103000064_ref001",
+ "S1047951103000064_ref039",
+ "S1047951103000064_ref032",
+ "S1047951103000064_ref014",
+ "S1047951103000064_ref008",
+ "S1047951103000064_ref038",
+ "S1047951103000064_ref018",
+ "S1047951103000064_ref027",
+ "S1047951103000064_ref034",
+ "S1047951103000064_ref044",
+ "S1047951103000064_ref006",
+ "S1047951103000064_ref030",
+ ]
+ )
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 8497b10..b00a88d 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -1,22 +1,28 @@
-
-import xml
import json
+import xml
+
import pytest
-from grobid2json import *
+from grobid_tei_xml import parse_document_xml
def test_small_xml():
-
- with open('tests/files/small.xml', 'r') as f:
+ """
+    This used to be a test of grobid2json; now it is a compatibility test for
+ the to_legacy_dict() feature of grobid_tei_xml.
+ """
+
+ with open("tests/files/small.xml", "r") as f:
tei_xml = f.read()
- with open('tests/files/small.json', 'r') as f:
- json_form = json.loads(f.read())
+ with open("tests/files/small.json", "r") as f:
+ json_form = json.loads(f.read())
+
+ tei_doc = parse_document_xml(tei_xml)
+ assert tei_doc.to_legacy_dict() == json_form
- assert teixml2json(tei_xml) == json_form
def test_invalid_xml():
with pytest.raises(xml.etree.ElementTree.ParseError):
- teixml2json("this is not XML")
+ parse_document_xml("this is not XML")
with pytest.raises(ValueError):
- teixml2json("<xml></xml>")
+ parse_document_xml("<xml></xml>")
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 9a81852..043c63d 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,33 +1,7 @@
-
-import json
-import pytest
-import responses
-
from sandcrawler.html import extract_fulltext_url
+
def test_extract_fulltext_url():
resp = extract_fulltext_url("asdf", b"asdf")
assert resp == {}
-
- resp = extract_fulltext_url(
- "http://dummy-site/",
- b"""<html>
- <head>
- <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
- </head>
- <body>
- <h1>my big article here</h1>
- blah
- </body>
- </html>"""
- )
- assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf"
- assert resp['technique'] == "citation_pdf_url"
-
- with open('tests/files/plos_one_article.html', 'rb') as f:
- resp = extract_fulltext_url(
- "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
- f.read(),
- )
- assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index e6e48ac..ba4acf1 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,14 +1,10 @@
-
-import datetime
-import pytest
-
-from sandcrawler.html_ingest import *
+from sandcrawler.ingest_html import *
def test_html_extract_ojs3() -> None:
- with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f:
+ with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f:
ojs3_html = f.read()
fulltext = html_extract_body_teixml(ojs3_html)
- assert fulltext['status'] == 'success'
+ assert fulltext["status"] == "success"
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index bf26a98..69bd211 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -1,5 +1,5 @@
-
import datetime
+
import pytest
from sandcrawler.html_metadata import *
@@ -7,14 +7,20 @@ from sandcrawler.html_metadata import *
def test_html_metadata_plos() -> None:
- with open('tests/files/plos_one_article.html', 'r') as f:
+ with open("tests/files/plos_one_article.html", "r") as f:
plos_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
assert meta is not None
- assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ assert (
+ meta.title
+ == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ )
assert meta.doi == "10.1371/journal.pone.0213978"
- assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
assert meta.contrib_names == [
"Yang Li",
"Tuanjie Wang",
@@ -37,17 +43,26 @@ def test_html_metadata_plos() -> None:
assert meta.volume == "14"
assert meta.container_issn == "1932-6203"
assert meta.publisher == "Public Library of Science"
- assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
+ assert (
+ meta.raw_references
+ and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;"
+ in meta.raw_references
+ )
assert meta.release_type == "article-journal"
- assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
def test_html_metadata_elife() -> None:
-
- with open('tests/files/elife_article.html', 'r') as f:
+
+ with open("tests/files/elife_article.html", "r") as f:
elife_html = f.read()
- meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html))
+ meta = html_extract_biblio(
+ "https://elifesciences.org/articles/44753", HTMLParser(elife_html)
+ )
assert meta is not None
assert meta.title == "Parallel visual circuitry in a basal chordate"
assert meta.doi == "10.7554/eLife.44753"
@@ -64,28 +79,34 @@ def test_html_metadata_elife() -> None:
# 2019-04-18
assert meta.release_date == datetime.date(year=2019, month=4, day=18)
assert meta.publisher == "eLife Sciences Publications Limited"
- assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ )
def test_html_metadata_peerj() -> None:
-
- with open('tests/files/peerj_oa_article.html', 'r') as f:
+
+ with open("tests/files/peerj_oa_article.html", "r") as f:
peerj_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
assert meta is not None
- assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ assert (
+ meta.title
+ == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ )
assert meta.doi == "10.7717/peerj.4375"
assert meta.contrib_names == [
- "Heather Piwowar",
- "Jason Priem",
- "Vincent Larivière",
- "Juan Pablo Alperin",
- "Lisa Matthias",
- "Bree Norlander",
- "Ashley Farley",
- "Jevin West",
- "Stefanie Haustein",
+ "Heather Piwowar",
+ "Jason Priem",
+ "Vincent Larivière",
+ "Juan Pablo Alperin",
+ "Lisa Matthias",
+ "Bree Norlander",
+ "Ashley Farley",
+ "Jevin West",
+ "Stefanie Haustein",
]
assert meta.container_name == "PeerJ"
# "2018-02-13"
@@ -95,7 +116,7 @@ def test_html_metadata_peerj() -> None:
def test_html_metadata_nature() -> None:
- with open('tests/files/nature_article.html', 'r') as f:
+ with open("tests/files/nature_article.html", "r") as f:
nature_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
@@ -110,12 +131,15 @@ def test_html_metadata_nature() -> None:
assert meta.release_date == datetime.date(year=2020, month=9, day=10)
assert meta.publisher == "Nature Publishing Group"
    # note: some error in Dublin Core metadata in nature HTML resulting in duplication
- assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ assert (
+ meta.abstract
+ == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ )
def test_html_metadata_ojs3() -> None:
- with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
ojs3_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
@@ -128,19 +152,25 @@ def test_html_metadata_ojs3() -> None:
"Os Keyes",
]
assert meta.container_name == "First Monday"
- assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
assert meta.container_issn == "1396-0466"
# "2020/09/10"
assert meta.release_date == datetime.date(year=2020, month=9, day=10)
assert meta.lang == "en"
- assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
- assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ assert (
+ meta.abstract
+ == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ )
+ assert (
+ meta.html_fulltext_url
+ == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ )
assert meta.release_type == "article-journal"
def test_html_metadata_dlib() -> None:
- with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
dlib_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
@@ -149,6 +179,7 @@ def test_html_metadata_dlib() -> None:
# "2017-05-15"
assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
def test_html_metadata_dc_case() -> None:
"""
This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
@@ -166,13 +197,15 @@ def test_html_metadata_dc_case() -> None:
assert meta is not None
assert meta.issue == "123"
+
@pytest.fixture
def adblock() -> Any:
return load_adblock_rules()
+
def test_html_resources(adblock) -> None:
- with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
dlib_html = f.read()
resources = html_extract_resources(
@@ -185,9 +218,9 @@ def test_html_resources(adblock) -> None:
# check that adblock working
for r in resources:
- assert '/ga.js' not in r['url']
+ assert "/ga.js" not in r["url"]
- with open('tests/files/plos_one_article.html', 'r') as f:
+ with open("tests/files/plos_one_article.html", "r") as f:
plos_html = f.read()
resources = html_extract_resources(
@@ -198,9 +231,9 @@ def test_html_resources(adblock) -> None:
# check that custom adblock working
for r in resources:
- assert 'crossmark-cdn.crossref.org' not in r['url']
+ assert "crossmark-cdn.crossref.org" not in r["url"]
- with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
monday_html = f.read()
resources = html_extract_resources(
@@ -209,7 +242,7 @@ def test_html_resources(adblock) -> None:
adblock,
)
- with open('tests/files/elife_article.html', 'r') as f:
+ with open("tests/files/elife_article.html", "r") as f:
elife_html = f.read()
resources = html_extract_resources(
@@ -218,7 +251,7 @@ def test_html_resources(adblock) -> None:
adblock,
)
- with open('tests/files/nature_article.html', 'r') as f:
+ with open("tests/files/nature_article.html", "r") as f:
nature_html = f.read()
resources = html_extract_resources(
@@ -226,4 +259,3 @@ def test_html_resources(adblock) -> None:
HTMLParser(nature_html),
adblock,
)
-
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 46346b7..e14a452 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -1,12 +1,12 @@
-
import json
+
import pytest
import responses
+from test_grobid import REAL_TEI_XML
+from test_savepagenow import *
+from test_wayback import *
from sandcrawler import *
-from test_wayback import *
-from test_savepagenow import *
-from test_grobid import REAL_TEI_XML
@pytest.fixture
@@ -21,6 +21,7 @@ def ingest_worker(wayback_client, spn_client):
)
return worker
+
@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
grobid_client = GrobidClient(
@@ -41,153 +42,223 @@ def ingest_worker_pdf(wayback_client_pdf, spn_client):
@responses.activate
def test_ingest_success(ingest_worker_pdf):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=pdf_bytes)
- responses.add(responses.GET,
- 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ body=pdf_bytes,
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/grobid?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
status=200,
- body=json.dumps([]))
- responses.add(responses.GET,
- 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
status=200,
- body=json.dumps([]))
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
resp = ingest_worker_pdf.process(request)
print(resp)
- assert resp['hit'] == True
- assert resp['status'] == "success"
- assert resp['request'] == request
- assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
- assert type(resp['terminal']['terminal_dt']) == str
- assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
- assert resp['terminal']['terminal_status_code']
- assert type(resp['file_meta']['size_bytes']) == int
- assert resp['file_meta']['mimetype'] == "application/pdf"
- assert resp['cdx']['url'] == TARGET + "/redirect"
- assert 'warc_path' not in resp['cdx']
- assert 'revisit_cdx' not in resp
- assert resp['grobid']['status'] == "success"
- assert resp['grobid']['status_code'] == 200
- assert resp['grobid']['grobid_version']
- assert 'fatcat_release' in resp['grobid']
- assert 'grobid_version' not in resp['grobid']['metadata']
- assert 'fatcat_release' not in resp['grobid']['metadata']
- assert not 'tei_xml' in resp['grobid']
- assert resp['pdf_meta']['status'] == "success"
- assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
- assert resp['pdf_meta'].get('text') is None
+ assert resp["hit"] is True
+ assert resp["status"] == "success"
+ assert resp["request"] == request
+ assert resp["terminal"]["terminal_sha1hex"] == resp["file_meta"]["sha1hex"]
+ assert type(resp["terminal"]["terminal_dt"]) == str
+ assert resp["terminal"]["terminal_url"] == TARGET + "/redirect"
+ assert resp["terminal"]["terminal_status_code"]
+ assert type(resp["file_meta"]["size_bytes"]) == int
+ assert resp["file_meta"]["mimetype"] == "application/pdf"
+ assert resp["cdx"]["url"] == TARGET + "/redirect"
+ assert "warc_path" not in resp["cdx"]
+ assert "revisit_cdx" not in resp
+ assert resp["grobid"]["status"] == "success"
+ assert resp["grobid"]["status_code"] == 200
+ assert resp["grobid"]["grobid_version"]
+ assert "fatcat_release" in resp["grobid"]
+ assert "grobid_version" not in resp["grobid"]["metadata"]
+ assert "fatcat_release" not in resp["grobid"]["metadata"]
+ assert "tei_xml" not in resp["grobid"]
+ assert resp["pdf_meta"]["status"] == "success"
+ assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1
+ assert resp["pdf_meta"].get("text") is None
+
@responses.activate
def test_ingest_landing(ingest_worker):
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ body=WARC_BODY,
+ )
    # this is for the second time around; don't want to fetch the same landing
    # page HTML again and result in a loop
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body="<html></html>")
+ body="<html></html>",
+ )
resp = ingest_worker.process(request)
print(resp)
- assert resp['hit'] == False
- assert resp['status'] == "no-pdf-link"
- assert resp['request'] == request
- assert 'terminal' in resp
- assert 'file_meta' not in resp
- assert 'cdx' not in resp
- assert 'revisit_cdx' not in resp
- assert 'grobid' not in resp
+ assert resp["hit"] is False
+ assert resp["status"] == "no-pdf-link"
+ assert resp["request"] == request
+ assert "terminal" in resp
+ assert "file_meta" not in resp
+ assert "cdx" not in resp
+ assert "revisit_cdx" not in resp
+ assert "grobid" not in resp
+
@responses.activate
def test_ingest_blocklist(ingest_worker):
ingest_worker.base_url_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] == False
- assert resp['status'] == "skip-url-blocklist"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-url-blocklist"
+ assert resp["request"] == request
@responses.activate
def test_ingest_wall_blocklist(ingest_worker):
ingest_worker.wall_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] == False
- assert resp['status'] == "skip-wall"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-wall"
+ assert resp["request"] == request
+
+
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/cookieAbsent",
+ }
+
+ resp = ingest_worker.process(request)
+ assert resp["hit"] is False
+ assert resp["status"] == "blocked-cookie"
+ assert resp["request"] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 429c6b0..9bd8b5f 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -1,4 +1,3 @@
-
"""
This file contains tests to run against "live" wayback services. They default
to "skip" because you need authentication, and we shouldn't hit these services
@@ -7,10 +6,9 @@ automatically in CI.
Simply uncomment lines to run.
"""
-import json
import pytest
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata
+from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata
@pytest.fixture
@@ -18,16 +16,19 @@ def cdx_client():
client = CdxApiClient()
return client
+
@pytest.fixture
def wayback_client():
client = WaybackClient()
return client
+
@pytest.fixture
def spn_client():
client = SavePageNowClient()
return client
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch(cdx_client):
@@ -42,12 +43,16 @@ def test_cdx_fetch(cdx_client):
assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
assert resp.warc_csize == 25338
assert resp.warc_offset == 240665973
- assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ assert (
+ resp.warc_path
+ == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ )
# bogus datetime; shouldn't match
with pytest.raises(KeyError):
resp = cdx_client.fetch(url, "12345678123456")
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_lookup_best(cdx_client):
@@ -66,24 +71,31 @@ def test_cdx_lookup_best(cdx_client):
assert resp.mimetype == "text/html"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_wayback_fetch(wayback_client):
- resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz")
+ resp = wayback_client.fetch_petabox(
+ 25683,
+ 2676464871,
+ "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz",
+ )
assert resp.body
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_resource_success(wayback_client):
url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url in (url, url.replace("https://", "http://"))
assert resp.cdx.url in (url, url.replace("https://", "http://"))
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch_spn2(cdx_client):
@@ -104,9 +116,9 @@ def test_cdx_fetch_spn2(cdx_client):
# https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410
- #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
datetime = "20200110222410"
@@ -117,6 +129,7 @@ def test_cdx_fetch_spn2(cdx_client):
assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_ftp(wayback_client):
# ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
@@ -127,29 +140,30 @@ def test_lookup_ftp(wayback_client):
url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
- assert resp.terminal_status_code == 226
+ assert resp.terminal_status_code in (226, 200)
assert resp.cdx.url == url
assert resp.revisit_cdx
assert resp.revisit_cdx.url != url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
# not revisit?
url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
- assert resp.terminal_status_code == 226
+ assert resp.terminal_status_code in (226, 200)
assert resp.cdx.url == url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_crawl_ftp(spn_client, wayback_client):
@@ -158,10 +172,10 @@ def test_crawl_ftp(spn_client, wayback_client):
resp = spn_client.crawl_resource(url, wayback_client)
# FTP isn't supported yet!
- #assert resp.hit == True
- #assert resp.status == "success"
- #assert resp.terminal_url == url
- #assert resp.cdx.url == url
+ # assert resp.hit is True
+ # assert resp.status == "success"
+ # assert resp.terminal_url == url
+ # assert resp.cdx.url == url
- assert resp.hit == False
+ assert resp.hit is False
assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 29f9e9f..2bad851 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,77 +1,110 @@
-
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_line,
+)
+
def test_gen_file_metadata():
-
+
# valid (but very small) PDF file
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
file_meta = gen_file_metadata(f.read())
assert file_meta == {
- 'mimetype': 'application/pdf',
- 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
- 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
- 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
- 'size_bytes': 13264,
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
}
# valid HTML
fm = gen_file_metadata(
- b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
- assert fm['mimetype'] == 'text/html'
+ b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
+ )
+ assert fm["mimetype"] == "text/html"
# bogus text
fm = gen_file_metadata(b"asdf1234")
- assert fm['mimetype'] == 'text/plain'
- assert fm['size_bytes'] == 8
+ assert fm["mimetype"] == "text/plain"
+ assert fm["size_bytes"] == 8
+
+
+def test_gen_file_metadata_path():
+
+ # valid (but very small) PDF file
+ file_meta = gen_file_metadata_path("tests/files/dummy.pdf")
+ assert file_meta == {
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
+ }
+
def test_b32_hex():
# valid b32
- assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
- assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert (
+ b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
+ assert (
+ b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
# sha1hex pass-through
- s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+ s = "bda3c1017d52e826bbd1da51efad877272d300f9"
assert b32_hex(s) == s
# invalid
with pytest.raises(ValueError):
- assert b32_hex('blah') == 'blah'
+ assert b32_hex("blah") == "blah"
+
def test_parse_cdx_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
correct = {
- 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
- 'mimetype': "application/pdf",
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'datetime': "20170828233154",
- 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'warc_offset': 931661233,
- 'warc_csize': 210251,
- 'http_status': 200,
+ "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ "mimetype": "application/pdf",
+ "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "datetime": "20170828233154",
+ "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "warc_offset": 931661233,
+ "warc_csize": 210251,
+ "http_status": 200,
}
assert parse_cdx_line(raw) == correct
assert parse_cdx_line(raw + "\n") == correct
assert parse_cdx_line(raw + " extra_field") == correct
+
def test_invalid_cdx():
print("missing warc")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) is None
+
def test_clean_url():
assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
- assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
- "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
-
+ assert (
+ clean_url(
+ "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
+ == "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 255e3fb..9d75655 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -1,68 +1,71 @@
-
-import pytest
import struct
-import responses
+
import poppler
+import pytest
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
from sandcrawler.pdfextract import process_pdf
-from test_wayback import wayback_client, cdx_client
-
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
def test_process_fake_pdf():
resp = process_pdf(FAKE_PDF_BYTES)
print(resp)
assert resp.status == "not-pdf"
- with open('tests/files/dummy_zip.zip', 'rb') as f:
+ with open("tests/files/dummy_zip.zip", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
- assert resp.status == 'not-pdf'
+ assert resp.status == "not-pdf"
+
-@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler")
+@pytest.mark.skipif(
+ poppler.version_string() == "0.71.0", reason="unsupported version of poppler"
+)
def test_process_dummy_pdf():
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
- assert resp.status == 'success'
+ assert resp.status == "success"
assert resp.page0_thumbnail is not None
assert len(resp.text) > 10
assert resp.meta_xml is None
- assert resp.file_meta['mimetype'] == 'application/pdf'
+ assert resp.file_meta["mimetype"] == "application/pdf"
print(resp.pdf_info)
print(resp.pdf_extra)
- assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
+ assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis"
# 595 x 842
- assert resp.pdf_extra['page0_height'] == 842
- assert resp.pdf_extra['page0_width'] == 595
- assert resp.pdf_extra['page_count'] == 1
+ assert resp.pdf_extra["page0_height"] == 842
+ assert resp.pdf_extra["page0_width"] == 595
+ assert resp.pdf_extra["page_count"] == 1
+
-def test_pdfextract_worker_cdx(wayback_client):
+def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
sink = BlackholeSink()
worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
def test_pdfextract_blob_worker():
sink = BlackholeSink()
worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
worker.process(pdf_bytes)
-
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 52f26c0..ed17d24 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,7 +1,4 @@
-
-import pytest
-
-from sandcrawler.workers import CdxLinePusher, BlackholeSink
+from sandcrawler.workers import BlackholeSink, CdxLinePusher
def test_cdx_line_pusher():
@@ -9,20 +6,24 @@ def test_cdx_line_pusher():
sink = BlackholeSink()
# vanilla (only default filters)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(sink, cdx_file)
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['pushed'] == 19
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["pushed"] == 19
# HTTP 200 and application/pdf
- with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(sink, cdx_file,
- filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226])
+ with open("tests/files/example.cdx", "r") as cdx_file:
+ pusher = CdxLinePusher(
+ sink,
+ cdx_file,
+ filter_mimetypes=["application/pdf"],
+ filter_http_statuses=[200, 226],
+ )
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['skip-http_status'] == 10
- assert counts['skip-mimetype'] == 2
- assert counts['pushed'] == 7
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["skip-http_status"] == 10
+ assert counts["skip-mimetype"] == 2
+ assert counts["pushed"] == 7
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 63dd887..add2c60 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -1,11 +1,10 @@
-
import json
+
import pytest
import responses
-
-from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial
from test_wayback import *
+from sandcrawler import CdxPartial, SavePageNowBackoffError, SavePageNowClient, SavePageNowError
TARGET = "http://dummy-target.dummy"
JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
@@ -16,7 +15,7 @@ PENDING_BODY = {
"https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js",
"https://cdn.onesignal.com/sdks/OneSignalSDK.js",
- ]
+ ],
}
SUCCESS_BODY = {
"status": "success",
@@ -58,12 +57,12 @@ SUCCESS_BODY = {
"https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png",
"https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1",
"https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
- "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
+ "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
],
- "outlinks":{
+ "outlinks": {
"https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
- "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
- }
+ "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695",
+ },
}
ERROR_BODY = {
"status": "error",
@@ -71,13 +70,38 @@ ERROR_BODY = {
"status_ext": "error:invalid-host-resolution",
"job_id": JOB_ID,
"message": "Couldn't resolve host for http://example5123.com.",
- "resources": []
+ "resources": [],
}
CDX_SPN_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180326070330",
+ TARGET + "/redirect",
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz",
+ ],
]
+
@pytest.fixture
def spn_client():
client = SavePageNowClient(
@@ -88,112 +112,216 @@ def spn_client():
client.poll_seconds = 0.0
return client
+
@responses.activate
def test_savepagenow_success(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
+ body=json.dumps(SUCCESS_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 4
+ assert len(responses.calls) == 5
- assert resp.success == True
+ assert resp.success is True
assert resp.status == "success"
assert resp.request_url == TARGET
assert resp.terminal_url == TARGET + "/redirect"
- assert resp.terminal_dt == SUCCESS_BODY['timestamp']
- assert resp.resources == SUCCESS_BODY['resources']
+ assert resp.terminal_dt == SUCCESS_BODY["timestamp"]
+ assert resp.resources == SUCCESS_BODY["resources"]
+
@responses.activate
def test_savepagenow_remote_error(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(ERROR_BODY))
+ body=json.dumps(ERROR_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 3
+ assert len(responses.calls) == 4
- assert resp.success == False
- assert resp.status == ERROR_BODY['status_ext']
+ assert resp.success is False
+ assert resp.status == ERROR_BODY["status_ext"]
assert resp.request_url == TARGET
- assert resp.terminal_url == None
- assert resp.terminal_dt == None
- assert resp.resources == None
+ assert resp.terminal_url is None
+ assert resp.terminal_dt is None
+ assert resp.resources is None
+
@responses.activate
def test_savepagenow_500(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=500,
- body=json.dumps(ERROR_BODY))
+ body=json.dumps(ERROR_BODY),
+ )
with pytest.raises(SavePageNowError):
- resp = spn_client.save_url_now_v2(TARGET)
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 3
+
+
+@responses.activate
+def test_savepagenow_no_slots(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 0,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+
+ with pytest.raises(SavePageNowBackoffError):
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 1
- assert len(responses.calls) == 2
@responses.activate
def test_crawl_resource(spn_client, wayback_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ body=WARC_BODY,
+ )
- print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"))
+ print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"))
resp = spn_client.crawl_resource(TARGET, wayback_client)
- assert len(responses.calls) == 5
+ assert len(responses.calls) == 6
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.body == WARC_BODY
assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
@@ -201,4 +329,3 @@ def test_crawl_resource(spn_client, wayback_client):
assert type(resp.cdx) == CdxPartial
with pytest.raises(AttributeError):
print(resp.cdx.warc_path)
-
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6bc1ca4..da4dfd8 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,36 +1,156 @@
-
import json
+
import pytest
import responses
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
-
+from sandcrawler import CdxApiClient, WaybackClient
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
]
CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_MULTI_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner, but not right mimetype
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner and mimetype, but wrong status code
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # "best"
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # older
- ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner, but not right mimetype
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner and mimetype, but wrong status code
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "400",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "500",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "150",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # "best"
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # older
+ [
+ "wiki,fatcat)/",
+ "20180712220054",
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
]
+
@pytest.fixture
def cdx_client():
client = CdxApiClient(
@@ -39,13 +159,13 @@ def cdx_client():
)
return client
+
@responses.activate
def test_cdx_fetch(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
@@ -58,16 +178,16 @@ def test_cdx_fetch(cdx_client):
assert resp.warc_offset == 108062304
assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
@responses.activate
def test_cdx_fetch_errors(cdx_client):
with pytest.raises(ValueError):
resp = cdx_client.fetch(CDX_TARGET, "2019")
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
with pytest.raises(KeyError):
resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -77,14 +197,15 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
assert len(responses.calls) == 3
+ assert resp
+
@responses.activate
def test_cdx_lookup_best(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
@@ -95,6 +216,7 @@ def test_cdx_lookup_best(cdx_client):
assert resp.sha1b32 == CDX_BEST_SHA1B32
assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
WARC_TARGET = "http://fatcat.wiki/"
WARC_BODY = b"""
<html>
@@ -108,6 +230,7 @@ WARC_BODY = b"""
</html>
"""
+
@pytest.fixture
def wayback_client(cdx_client, mocker):
client = WaybackClient(
@@ -127,10 +250,11 @@ def wayback_client(cdx_client, mocker):
return client
+
@pytest.fixture
def wayback_client_pdf(cdx_client, mocker):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
client = WaybackClient(
@@ -150,6 +274,7 @@ def wayback_client_pdf(cdx_client, mocker):
return client
+
@responses.activate
def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
@@ -159,14 +284,14 @@ def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
assert resp == WARC_BODY
+
@responses.activate
def test_lookup_resource_success(wayback_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = wayback_client.lookup_resource(CDX_TARGET)
- assert resp.hit == True
+ assert resp.hit is True
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
index a996c56..786f863 100644
--- a/python/tests/test_xml.py
+++ b/python/tests/test_xml.py
@@ -1,12 +1,11 @@
-
import pytest
from sandcrawler.xml import xml_reserialize
def test_xml_reserialize() -> None:
-
- with open('tests/files/scielo_article.jats.xml', 'rb') as f:
+
+ with open("tests/files/scielo_article.jats.xml", "rb") as f:
raw_xml = f.read()
assert b'encoding="ISO-8859-1"' in raw_xml
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
index 198c949..7866480 100644
--- a/python_hadoop/README.md
+++ b/python_hadoop/README.md
@@ -68,7 +68,7 @@ running on a devbox and GROBID running on a dedicated machine:
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
tests/files/example.cdx
@@ -76,7 +76,7 @@ Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
-r hadoop \
-c mrjob.conf \
@@ -90,13 +90,13 @@ running on a devbox:
./backfill_hbase_from_cdx.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
tests/files/example.cdx
Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./backfill_hbase_from_cdx.py \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--hbase-table wbgrp-journal-extract-0-qa \
-r hadoop \
-c mrjob.conf \
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
new file mode 100644
index 0000000..963fb10
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -0,0 +1,187 @@
+package sandcrawler
+
+import java.util.Properties
+
+import scala.util.Try
+import scala.util.matching.Regex
+import scala.util.parsing.json.JSONObject
+
+import cascading.pipe.joiner._
+import cascading.property.AppProps
+import cascading.tap.SinkMode
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Type that represents a raw parsed CDX line
+case class CdxLine(surt: String, datetime: String, url: String, mime: String, httpStatus: String, sha1: String, c_size: String, offset: String, warc: String)
+
+/**
+ * CDX backfill:
+ * 1. parse CDX (all columns)
+ * 2. filter CDX (pdf, HTTP 200, etc)
+ * 3. source HBase (key column only)
+ * 4. left join CDX to HBase
+ * 5. filter to only those with null HBase key column
+ * 6. convert CDX fields to HBase columns
+ * 7. sink results to HBase
+ */
+class CdxBackfillJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ import CdxBackfillJob._
+
+ val hbaseSource = getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
+ val hbaseSink = getHBaseSink(args("hbase-table"), args("zookeeper-hosts"))
+
+ // Parse CDX lines from text file to typed pipe
+ val lines : TypedPipe[String] = TypedPipe.from(TextLine(args("cdx-input-path")))
+
+ val cdxLines : TypedPipe[CdxLine] = lines
+ .filter { isCdxLine }
+ .map { lineToCdxLine }
+ .filter { CdxBackfillJob.keepCdx(_) }
+
+ // (key, f:c, file:cdx, file:mime)
+ val cdxRows : TypedPipe[(String, String, String, String)] = cdxLines
+ .map { CdxBackfillJob.cdxLineToRow }
+ .debug
+
+ val existingKeys : TypedPipe[String] = hbaseSource
+ .read
+ .fromBytesWritable( new Fields("key") )
+ .toTypedPipe[String]('key)
+ //.debug
+
+ // filters out all the lines that have an existing SHA1 key in HBase
+ // the groupBy statements are to select key values to join on.
+ // (key, f:c, file:cdx, file:mime)
+ val newRows : TypedPipe[(String, String, String, String)] = existingKeys
+ .groupBy( identity )
+ .rightJoin(cdxRows.groupBy(_._1))
+ .toTypedPipe
+ .collect { case (_, (None, row)) => row }
+ .debug
+
+ // convert to tuple form and write out into HBase
+ newRows
+ .toPipe('key, 'c, 'cdx, 'mime)
+ .toBytesWritable( new Fields("key", "c", "cdx", "mime") )
+ .write(hbaseSink)
+
+}
+
+object CdxBackfillJob {
+
+ def getHBaseSource(hbase_table: String, zookeeper_hosts: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbase_table,
+ zookeeper_hosts,
+ List("file:size"), // not actually needed
+ SourceMode.SCAN_ALL)
+ }
+
+ def getHBaseSink(hbase_table: String, zookeeper_hosts: String) : HBaseSource = {
+ HBaseBuilder.buildSink(
+ hbase_table,
+ zookeeper_hosts,
+ List("f:c", "file:cdx", "file:mime"),
+ SinkMode.UPDATE)
+ }
+
+ def normalizeMime(raw: String) : String = {
+
+ val normalMime = Map(
+ "application/pdf" -> "application/pdf",
+ "application/x-pdf" -> "application/pdf",
+ "('application/pdf'" -> "application/pdf",
+ "image/pdf" -> "application/pdf",
+ "text/pdf" -> "application/pdf",
+ "\"application/pdf\"" -> "application/pdf",
+ "application/postscript" -> "application/postscript",
+ "text/html" -> "text/html",
+ "text/xml" -> "text/xml",
+ "application/xml" -> "text/xml"
+ )
+
+ val lower = raw.toLowerCase()
+ normalMime.find { case (key, _) =>
+ lower.startsWith(key)
+ } match {
+ case Some((_, value)) => value
+ case None => lower
+ }
+ }
+
+ def isCdxLine(line: String) : Boolean = {
+ // malformatted or non-CDX11 lines
+ !(line.startsWith("#") || line.startsWith(" ") || line.startsWith("filedesc") ||
+ line.split(" ").size != 11)
+ }
+
+ def keepCdx(line: CdxLine) : Boolean = {
+ val sha1Pattern = """[A-Z2-7]{32}""".r
+ if (List(line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc).contains("-")) {
+ false
+ } else if (line.httpStatus != "200") {
+ false
+ } else if (line.mime != "application/pdf") {
+ false
+ } else if (sha1Pattern.unapplySeq(line.sha1).isEmpty) {
+ false
+ } else if (List(line.c_size, line.offset, line.datetime).map(s => Try(s.toLong).toOption).contains(None)) {
+ false
+ } else {
+ true
+ }
+ }
+
+ // Returns (key, f:c, file:cdx, file:mime), all as strings, which is close to
+ // how they will be inserted into HBase
+ def cdxLineToRow(line: CdxLine) : (String, String, String, String) = {
+
+ val key = "sha1:" + line.sha1
+
+ val warcFile = line.warc.split('/')(1)
+
+    // Read CDX-style datetime and convert to ISO 8601 with second resolution
+ val dtFormat = new java.text.SimpleDateFormat("yyyyMMddHHmmss")
+ val isoFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'")
+ // TODO: timezones? UTC to UTC, so I don't think so.
+ val dtIso = isoFormat.format(dtFormat.parse(line.datetime))
+
+ // This is the "f:c" field. 'i' intentionally not set
+ // python: f:c = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
+ // python: warc_file = warc.split('/')[-1]
+ // python: dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
+ val heritrixInfo = JSONObject(Map(
+ "u" -> line.url,
+ "d" -> dtIso,
+ "f" -> warcFile,
+ "o" -> line.offset.toInt,
+ "c" -> line.c_size.toInt
+ ))
+
+ // python: dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
+ // offset=int(offset), warc=warc)
+ val fileCdx = JSONObject(Map(
+ "surt" -> line.surt,
+ "dt" -> line.datetime,
+ "url" -> line.url,
+ "c_size" -> line.c_size.toInt,
+ "offset" -> line.offset.toInt,
+ "warc" -> line.warc
+ ))
+ (key, heritrixInfo.toString(), fileCdx.toString(), normalizeMime(line.mime))
+ }
+
+ def lineToCdxLine(line: String) : CdxLine = {
+ val raw = line.split("\\s+")
+ // surt, datetime, url, mime, http_status, sha1, SKIP, SKIP, c_size, offset, warc
+ CdxLine(raw(0), raw(1), raw(2), raw(3), raw(4), raw(5), raw(8), raw(9), raw(10))
+ }
+
+}
diff --git a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
new file mode 100644
index 0000000..c092f7f
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
@@ -0,0 +1,175 @@
+
+package sandcrawler
+
+import org.scalatest._
+import cascading.tuple.{Tuple, Fields}
+import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions, TextLine}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.junit.runner.RunWith
+import org.scalatest.FunSpec
+import org.scalatest.junit.JUnitRunner
+import org.slf4j.LoggerFactory
+import parallelai.spyglass.hbase.HBaseSource
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import scala.util.parsing.json.JSON
+
+class CdxBackfillTest extends FlatSpec with Matchers {
+
+ import CdxBackfillJob._
+
+ it should "normalize mimetypes" in {
+ assert(CdxBackfillJob.normalizeMime("asdf") === "asdf")
+ assert(CdxBackfillJob.normalizeMime("application/pdf") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/pdf+journal") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("Application/PDF") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/p") === "application/p")
+ assert(CdxBackfillJob.normalizeMime("application/xml+stuff") === "text/xml")
+ assert(CdxBackfillJob.normalizeMime("application/x-pdf") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/x-html") === "application/x-html")
+ }
+
+ it should "filter CDX lines" in {
+ assert(true === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ // redirect
+ assert(false === keepCdx(lineToCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")))
+ // not PDF
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf text/plain 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ // invalid base32 SHA1
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FE010101010101010101VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL33FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ // dashed field
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 - application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ }
+
+ it should "know what CDX lines are" in {
+ assert(true === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ assert(false === isCdxLine(""))
+ assert(false === isCdxLine(
+ " edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ assert(false === isCdxLine(
+ "#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ // missing two fields
+ assert(false === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ // extra field
+ assert(false === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz -"))
+ }
+
+ it should "execute lineToRow" in {
+ // this particular test copied from python test_backfill_hbase_from_cdx.py
+ val row = cdxLineToRow(lineToCdxLine(
+ "eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1 20170705062202 http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1 application/PDF 200 MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J - - 854156 328850624 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+
+ assert(row._1 == "sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J")
+ JSON.parseFull(row._2) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("u") == "http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1")
+ assert(obj("f") == "CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ assert(obj("c") == 854156)
+ assert(obj("o") == 328850624)
+ assert(obj("d") == "2017-07-05T06:22:02Z")
+ }
+ case other => assert(false)
+ }
+ JSON.parseFull(row._3) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("surt") == "eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1")
+ assert(obj("dt") == "20170705062202")
+ assert(obj("url") == "http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1")
+ assert(obj("c_size") == 854156)
+ assert(obj("offset") == 328850624)
+ assert(obj("warc") == "CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ }
+ case other => assert(false)
+ }
+ assert(row._4 == "application/pdf")
+ }
+
+}
+
+@RunWith(classOf[JUnitRunner])
+class CdxBackfillJobTest extends FunSpec with TupleConversions {
+
+ val (testTable, testHost, testCdxFile) = ("test-table", "dummy-host:2181", "test_file.cdx")
+
+ val log = LoggerFactory.getLogger(this.getClass.getName)
+
+ val dummySizeBytes = Bytes.toBytes(100)
+
+ val sampleData = List(
+ List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), dummySizeBytes),
+ List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), dummySizeBytes),
+ List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), dummySizeBytes),
+ List(Bytes.toBytes("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT"), dummySizeBytes)
+ )
+ val sampleCdxLines = List(
+ // clean line
+ "0" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // has existing SHA1
+ "1" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // HTTP status code
+ "2" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // not CDX (prefixed with hash)
+ "3" -> """#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // not PDF
+ "4" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/film 200 AAAAAEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"""
+ )
+
+ JobTest("sandcrawler.CdxBackfillJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("cdx-input-path", testCdxFile)
+ .arg("debug", "true")
+ .source[Tuple](CdxBackfillJob.getHBaseSource(testTable, testHost),
+ sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .source(TextLine(testCdxFile), sampleCdxLines)
+ .sink[(ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable)](CdxBackfillJob.getHBaseSink(testTable, testHost)) {
+ outputBuffer =>
+
+ val buf0 = outputBuffer(0)
+ val row0 = List(buf0._1, buf0._2, buf0._3, buf0._4).map(b => Bytes.toString(b.copyBytes()))
+
+ it("should return a 1-element list (after join).") {
+ assert(outputBuffer.size === 1)
+ }
+
+ it("should insert the valid, new CDX line") {
+ assert(row0(0) == "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G")
+ JSON.parseFull(row0(1)) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("u") == "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("f") == "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ assert(obj("c") == 210251)
+ assert(obj("o") == 931661233)
+ assert(obj("d") == "2017-08-28T23:31:54Z")
+ }
+ case other => assert(false)
+ }
+ JSON.parseFull(row0(2)) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("surt") == "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("dt") == "20170828233154")
+ assert(obj("url") == "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("c_size") == 210251)
+ assert(obj("offset") == 931661233)
+ assert(obj("warc") == "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ }
+ case other => assert(false)
+ }
+ assert(row0(3) == "application/pdf")
+ }
+ }
+ .run
+ .finish
+}
diff --git a/sql/Makefile b/sql/Makefile
new file mode 100644
index 0000000..860addb
--- /dev/null
+++ b/sql/Makefile
@@ -0,0 +1,35 @@
+
+SHELL=/bin/bash -euo pipefail
+TODAY ?= $(shell date --iso --utc)
+DATADIR ?= /srv/sandcrawler/tasks/$(TODAY)
+DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S)
+DATABASE_URL ?= sandcrawler
+
+.PHONY: help
+help: ## Print info about all commands
+ @echo "Commands:"
+ @echo
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: create_datadir
+create_datadir:
+ mkdir -p $(DATADIR)/
+ sudo chmod a+rw $(DATADIR)/
+
+$(DATADIR)/.DB_DUMP:
+ sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip
+ mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump
+ touch $@
+
+.PHONY: database-snapshot
+database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot
+ @echo
+
+$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))"
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump
+ touch $@
+
+.PHONY: upload-database-snapshot
+upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org
+ @echo
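
A minimal usage sketch for the targets above, assuming the Makefile is invoked from the repository root with the default `DATADIR`:

    # print the annotated list of targets
    make -C sql help

    # dump the database (excluding the crossref table) into /srv/sandcrawler/tasks/<today>/
    make -C sql database-snapshot

    # create the snapshot (if not already done today) and upload it to archive.org
    make -C sql upload-database-snapshot
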
diff --git a/sql/README.md b/sql/README.md
index 42dba31..e488006 100644
--- a/sql/README.md
+++ b/sql/README.md
@@ -139,10 +139,30 @@ Questions we might want to answer
http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES
http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278
-## Full Database Dumps
-Run a dump in compressed, postgres custom format:
+## Full SQL Database Dumps
+
+Run a dump in compressed, Postgres custom format, excluding the `crossref` table (which is large and redundant):
export DATESLUG="`date +%Y-%m-%d.%H%M%S`"
- time sudo -u postgres pg_dump --verbose --format=custom sandcrawler > /sandcrawler-db/snapshots/sandcrawler_full_dbdump_${DATESLUG}.pgdump
+ time sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > sandcrawler_full_dbdump_${DATESLUG}.pgdump
+
+As of 2021-12-03, this process runs for about 6 hours and the compressed
+snapshot is about 102 GB (compared with roughly 940 GB of on-disk database
+size, including crossref).
+
+Then, upload to petabox as a backup:
+
+ ia upload sandcrawler_full_dbdump_YYYY-MM-DD -m mediatype:data -m collection:webgroup-internal-backups -m title:"Sandcrawler SQL Dump (YYYY-MM-DD)" sandcrawler_full_dbdump_${DATESLUG}.pgdump
+
+
+## SQL Database Restore
+
+To restore a dump (which will delete local database content, if any):
+
+ sudo su postgres
+ createuser --no-login web_anon
+ createuser -s sandcrawler
+ time pg_restore --jobs=4 --verbose --clean --if-exists --create --exit-on-error -d postgres sandcrawler_full_dbdump_2021-04-08.003952.pgdump
+
+This restore took about 2.5 hours.
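
A quick way to sanity-check a finished restore is to list tables and count rows in one of them. This is a sketch assuming the restored database is named `sandcrawler` and contains the `file_meta` table; a full `COUNT(*)` on a large table can take several minutes:

    sudo -u postgres psql sandcrawler -c '\dt'
    sudo -u postgres psql sandcrawler -c 'SELECT COUNT(*) FROM file_meta;'
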
diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md
index f1a5f86..4a56065 100644
--- a/sql/backfill/backfill.md
+++ b/sql/backfill/backfill.md
@@ -76,6 +76,19 @@ In psql:
COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
# => COPY 24727350
+On 2021-11-26:
+
+ zcat file_export.json.gz \
+ | pv -l \
+ | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \
+ | sort -S 8G \
+ | uniq -w 40 \
+ | pigz \
+ > fatcat_file.2021-11-26.tsv.gz
+
+ COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 112086814
+
## `file_meta`
zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv
diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql
index 1028c13..a7d6c2b 100644
--- a/sql/dump_file_meta.sql
+++ b/sql/dump_file_meta.sql
@@ -6,7 +6,7 @@ COPY (
FROM file_meta
ORDER BY sha1hex ASC
)
-TO '/grande/snapshots/file_meta_dump.tsv'
+TO '/srv/sandcrawler/tasks/file_meta_dump.tsv'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql
index 3ca8085..e7c48f3 100644
--- a/sql/dump_regrobid_pdf_petabox.sql
+++ b/sql/dump_regrobid_pdf_petabox.sql
@@ -9,7 +9,7 @@ COPY (
SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox
WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL)
)
-TO '/grande/snapshots/dump_regrobid_pdf_petabox.2020-02-03.json'
+TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql
new file mode 100644
index 0000000..698db7a
--- /dev/null
+++ b/sql/dump_reingest_bulk.sql
@@ -0,0 +1,31 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '24 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status like 'cdx-error'
+ OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql
new file mode 100644
index 0000000..7473420
--- /dev/null
+++ b/sql/dump_reingest_old.sql
@@ -0,0 +1,36 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '6 day'::INTERVAL
+ -- AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status like 'no-capture'
+ -- OR ingest_file_result.status like 'cdx-error'
+ -- OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql
index 303824b..dbeb199 100644
--- a/sql/dump_reingest_quarterly.sql
+++ b/sql/dump_reingest_quarterly.sql
@@ -1,78 +1,47 @@
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '91 day'::INTERVAL
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'spn2-%'
- AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
- AND ingest_file_result.status != 'spn2-error:filesize-limit'
- AND ingest_file_result.status != 'spn2-wayback-error'
-) TO '/grande/snapshots/reingest_quarterly_spn2-error_current.rows.json';
-
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'cdx-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '91 day'::INTERVAL
- AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_cdx-error_current.rows.json';
-
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'cdx-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '91 day'::INTERVAL
- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
- AND ingest_request.ingest_request_source != 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json';
-
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'wayback-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '91 day'::INTERVAL
-) TO '/grande/snapshots/reingest_quarterly_wayback-error_current.rows.json';
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'gateway-timeout'
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '91 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_gateway-timeout.rows.json';
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_current.rows.json';
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'petabox-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '91 day'::INTERVAL
- AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_petabox-error_current.rows.json';
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ROLLBACK;
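
Before running the full COPY above, it can help to sanity-check how many rows the consolidated filter will select. The following is only a sketch (it is not part of the committed file): it reuses the same join and 8-hour/91-day window, but collapses the status list and drops the source filter for brevity, so counts will be approximate.

    SELECT ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result ON
        ingest_file_result.base_url = ingest_request.base_url
        AND ingest_file_result.ingest_type = ingest_request.ingest_type
    WHERE
        ingest_request.ingest_type IN ('pdf', 'html', 'xml', 'component')
        AND ingest_file_result.hit = false
        AND ingest_request.created < NOW() - '8 hour'::INTERVAL
        AND ingest_request.created > NOW() - '91 day'::INTERVAL
        AND (ingest_file_result.status LIKE 'spn2-%'
             OR ingest_file_result.status IN ('cdx-error', 'wayback-error', 'petabox-error', 'gateway-timeout', 'no-capture'))
    GROUP BY ingest_file_result.status
    ORDER BY COUNT(*) DESC;
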
diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql
new file mode 100644
index 0000000..a83125c
--- /dev/null
+++ b/sql/dump_reingest_spn.sql
@@ -0,0 +1,36 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '6 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '180 day'::INTERVAL
+ AND ingest_request.ingest_request_source = 'savepapernow-web'
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql
new file mode 100644
index 0000000..b72a096
--- /dev/null
+++ b/sql/dump_reingest_terminalstatus.sql
@@ -0,0 +1,34 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '72 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '10 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 404
+ )
+ AND (
+ ingest_request.base_url LIKE 'https://doi.org/10.3390/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%'
+ )
+) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
+ROLLBACK;
diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql
index 28547a4..a019938 100644
--- a/sql/dump_reingest_weekly.sql
+++ b/sql/dump_reingest_weekly.sql
@@ -1,78 +1,42 @@
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '8 day'::INTERVAL
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'spn2-%'
- AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
- AND ingest_file_result.status != 'spn2-error:filesize-limit'
- AND ingest_file_result.status != 'spn2-wayback-error'
-) TO '/grande/snapshots/reingest_weekly_spn2-error_current.rows.json';
-
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'cdx-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '8 day'::INTERVAL
- AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_cdx-error_current.rows.json';
-
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'cdx-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '8 day'::INTERVAL
- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
- AND ingest_request.ingest_request_source != 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json';
-
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'wayback-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '8 day'::INTERVAL
-) TO '/grande/snapshots/reingest_weekly_wayback-error_current.rows.json';
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'gateway-timeout'
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '8 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_gateway-timeout.rows.json';
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json';
-COPY (
- SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
- AND ingest_file_result.ingest_type = 'pdf'
- AND ingest_file_result.hit = false
- AND ingest_file_result.status like 'petabox-error'
- AND ingest_request.created < NOW() - '8 hour'::INTERVAL
- AND ingest_request.created > NOW() - '8 day'::INTERVAL
- AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_petabox-error_current.rows.json';
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ROLLBACK;
diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql
index fb4b0af..a7fb920 100644
--- a/sql/dump_unextracted_pdf.sql
+++ b/sql/dump_unextracted_pdf.sql
@@ -16,7 +16,7 @@ COPY (
AND ingest_file_result.terminal_sha1hex IS NOT NULL
AND pdf_meta.sha1hex IS NULL
)
-TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql
index 7db34fb..bb9f162 100644
--- a/sql/dump_unextracted_pdf_petabox.sql
+++ b/sql/dump_unextracted_pdf_petabox.sql
@@ -12,7 +12,7 @@ COPY (
WHERE petabox.sha1hex IS NOT NULL
AND pdf_meta.sha1hex IS NULL
)
-TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-07-22.json'
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql
index e65edd5..81caf18 100644
--- a/sql/dump_ungrobid_pdf.sql
+++ b/sql/dump_ungrobid_pdf.sql
@@ -12,7 +12,7 @@ COPY (
-- uncomment/comment this to control whether only fatcat files are included
--AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex)
)
-TO '/grande/snapshots/dump_ungrobided_pdf.fatcat.2020-08-04.json'
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql
index f758ec2..b7a1db2 100644
--- a/sql/dump_ungrobid_pdf_petabox.sql
+++ b/sql/dump_ungrobid_pdf_petabox.sql
@@ -11,7 +11,7 @@ COPY (
-- uncomment/comment this to control whether only fatcat files are included
AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex)
)
-TO '/grande/snapshots/dump_ungrobided_pdf_petabox.2020-08-04.json'
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql
index d089c7e..333ff7b 100644
--- a/sql/dump_unmatched_glutton_pdf.sql
+++ b/sql/dump_unmatched_glutton_pdf.sql
@@ -12,7 +12,7 @@ COPY (
AND grobid.fatcat_release IS NOT NULL
LIMIT 1000
)
-TO '/grande/snapshots/dump_unmatched_glutton_pdf.2020-06-30.json';
+TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json';
--TO STDOUT
--WITH NULL '';
diff --git a/sql/ingest_again.md b/sql/ingest_again.md
index 3b4b990..b749557 100644
--- a/sql/ingest_again.md
+++ b/sql/ingest_again.md
@@ -12,7 +12,7 @@
AND ingest_file_result.status like 'spn2-%'
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
- ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -25,7 +25,7 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -38,7 +38,7 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source != 'fatcat-changelog'
AND ingest_request.ingest_request_source != 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -49,7 +49,7 @@
AND ingest_file_result.status like 'wayback-error'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
- ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -62,7 +62,7 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -75,16 +75,16 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json';
Transform:
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
Push to kafka (shuffled):
@@ -122,10 +122,10 @@ Push to kafka (not shuffled):
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
AND ingest_request.ingest_request_source = 'fatcat-ingest'
- ) TO '/grande/snapshots/reingest_fatcat_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json';
# note: shuf
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 73bd7f1..33dba66 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -42,7 +42,9 @@ CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex);
CREATE TABLE IF NOT EXISTS fatcat_file (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
file_ident TEXT CHECK (octet_length(file_ident) = 26),
- first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
+ first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26),
+ any_url BOOLEAN,
+ content_scope TEXT CHECK (octet_length(content_scope) >= 1)
);
CREATE TABLE IF NOT EXISTS petabox (
@@ -147,6 +149,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);
CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created);
CREATE TABLE IF NOT EXISTS ingest_file_result (
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
@@ -154,7 +157,7 @@ CREATE TABLE IF NOT EXISTS ingest_file_result (
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
hit BOOLEAN NOT NULL,
- status TEXT CHECK (octet_length(terminal_url) >= 1),
+ status TEXT CHECK (octet_length(status) >= 1),
terminal_url TEXT CHECK (octet_length(terminal_url) >= 1),
terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14),
terminal_status_code INT,
@@ -165,6 +168,43 @@ CREATE TABLE IF NOT EXISTS ingest_file_result (
CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
+CREATE TABLE IF NOT EXISTS ingest_fileset_platform (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+
+ platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1),
+ platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1),
+ platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1),
+ ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1),
+ total_size BIGINT,
+ file_count BIGINT,
+ archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1),
+
+ archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1),
+ web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1),
+ web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14),
+
+ manifest JSONB,
+ -- list, similar to fatcat fileset manifest, plus extra:
+ -- status (str)
+ -- path (str)
+ -- size (int)
+ -- md5 (str)
+ -- sha1 (str)
+ -- sha256 (str)
+ -- mimetype (str)
+ -- extra (dict)
+ -- platform_url (str)
+ -- terminal_url (str)
+ -- terminal_dt (str)
+
+ PRIMARY KEY (ingest_type, base_url)
+);
+CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id);
+
CREATE TABLE IF NOT EXISTS shadow (
shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
@@ -175,3 +215,31 @@ CREATE TABLE IF NOT EXISTS shadow (
PRIMARY KEY(shadow_corpus, shadow_id)
);
CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
+
+CREATE TABLE IF NOT EXISTS crossref (
+ doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+ indexed TIMESTAMP WITH TIME ZONE NOT NULL,
+ record JSON NOT NULL,
+ PRIMARY KEY(doi)
+);
+
+CREATE TABLE IF NOT EXISTS grobid_refs (
+ source TEXT NOT NULL CHECK (octet_length(source) >= 1),
+ source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1),
+ source_ts TIMESTAMP WITH TIME ZONE,
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ refs_json JSON NOT NULL,
+ PRIMARY KEY(source, source_id)
+);
+
+CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS
+ SELECT
+ crossref.doi as doi,
+ crossref.indexed as indexed,
+ crossref.record as record,
+ grobid_refs.source_ts as source_ts,
+ grobid_refs.refs_json as refs_json
+ FROM crossref
+ LEFT JOIN grobid_refs ON
+ grobid_refs.source_id = crossref.doi
+ AND grobid_refs.source = 'crossref';
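
As a usage sketch for the new `crossref_with_refs` view (not part of the migration itself): a per-DOI lookup shows whether GROBID-parsed references are attached to a crossref record. The DOI below is only a placeholder.

    SELECT doi, indexed, (refs_json IS NOT NULL) AS has_grobid_refs
    FROM crossref_with_refs
    WHERE doi = '10.1234/example';
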
diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md
index cf3b190..0859e79 100644
--- a/sql/monitoring_queries.md
+++ b/sql/monitoring_queries.md
@@ -168,3 +168,35 @@ Overall status, updated requests past 3 days:
GROUP BY ingest_request.ingest_type, ingest_file_result.status
ORDER BY COUNT(*) DESC;
+## savepapernow and fatcat-ingest recent status
+
+Specific recent ingests (for debugging):
+
+ -- for record layout: \x
+ SELECT
+ ingest_file_result.status as status,
+ ingest_request.ingest_type as ingest_type,
+ ingest_request.ingest_request_source as source,
+ ingest_request.link_source_id as source_id,
+ ingest_request.base_url as base_url,
+ ingest_file_result.terminal_dt as dt,
+ ingest_file_result.terminal_status_code as status_code,
+ ingest_file_result.terminal_sha1hex as sha1hex,
+ grobid.status as grobid_status
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid
+ ON ingest_file_result.terminal_sha1hex = grobid.sha1hex
+ WHERE
+ ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
+ -- AND ingest_request.ingest_type = 'pdf'
+ -- AND ingest_request.ingest_type = 'html'
+ AND (
+ ingest_request.ingest_request_source = 'savepapernow-web'
+ -- OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ )
+ ORDER BY ingest_file_result.updated DESC
+ LIMIT 100;
+
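
A coarser variant of the same join, following the pattern of the other monitoring queries but not included in the file itself, collapses the per-request listing into per-source/per-status counts over the last 24 hours:

    SELECT
        ingest_request.ingest_request_source AS source,
        ingest_file_result.status,
        COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
        AND ingest_request.ingest_request_source IN ('savepapernow-web', 'fatcat-ingest')
    GROUP BY ingest_request.ingest_request_source, ingest_file_result.status
    ORDER BY COUNT(*) DESC
    LIMIT 30;
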
diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh
new file mode 100755
index 0000000..d39a171
--- /dev/null
+++ b/sql/reingest_bulk.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_bulk_current.json
+
+cat /srv/sandcrawler/tasks/reingest_bulk_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh
new file mode 100755
index 0000000..96e5416
--- /dev/null
+++ b/sql/reingest_old.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_old.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_old_current.json
+
+cat /srv/sandcrawler/tasks/reingest_old_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 44a22b3..8a2996c 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -7,14 +7,13 @@ set -o pipefail # fail if part of a '|' command fails
sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql
cd ../python
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_spn2-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_wayback-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_quarterly_gateway-timeout.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_petabox-error_current.json
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_quarterly_current.json
-cat /grande/snapshots/reingest_quarterly_spn2-error_current.json /grande/snapshots/reingest_quarterly_cdx-error_current.json /grande/snapshots/reingest_quarterly_wayback-error_current.json /grande/snapshots/reingest_quarterly_petabox-error_current.json /grande/snapshots/reingest_quarterly_gateway-timeout.json | shuf | head -n100000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
-
-cat /grande/snapshots/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \
+ | shuf \
+ | head -n120000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
new file mode 100755
index 0000000..c693a64
--- /dev/null
+++ b/sql/reingest_spn.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_spn.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \
+ > /srv/sandcrawler/tasks/reingest_spn.json
+
+cat /srv/sandcrawler/tasks/reingest_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh
new file mode 100755
index 0000000..5cb6d51
--- /dev/null
+++ b/sql/reingest_terminalstatus_forcerecrawl.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json
+
+cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \
+ | shuf \
+ | head -n100000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index dfd4869..d2e2444 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -7,14 +7,13 @@ set -o pipefail # fail if part of a '|' command fails
sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql
cd ../python
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_weekly_current.json
-cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | head -n40000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
-
-cat /grande/snapshots/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+cat /srv/sandcrawler/tasks/reingest_weekly_current.json \
+ | shuf \
+ | head -n80000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt
new file mode 100644
index 0000000..fca76b9
--- /dev/null
+++ b/sql/stats/2021-04-07_stats.txt
@@ -0,0 +1,430 @@
+
+## SQL Table Sizes
+
+ Size: 551.34G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 50 GB | 100 GB
+ "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB
+ "public"."ingest_request" | 39 GB | 45 GB | 83 GB
+ "public"."grobid" | 70 GB | 8613 MB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB
+ "public"."file_meta" | 35 GB | 31 GB | 66 GB
+ "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB
+ "public"."shadow" | 9517 MB | 10 GB | 20 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."html_meta" | 1172 MB | 10 MB | 1182 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (12 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 174200807 | 234313766162033
+ (1 row)
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 173816433
+ application/octet-stream | 155534
+ text/html | 115821
+ application/xml | 42170
+ application/xhtml+xml | 24347
+ text/plain | 15990
+ application/jats+xml | 6899
+ application/gzip | 6491
+ | 6034
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ application/x-bzip2 | 891
+ image/jpeg | 721
+ image/gif | 389
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297
+ application/x-compress | 272
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ image/png | 88
+ application/mac-binhex40 | 79
+ application/x-dosexec | 51
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ text/rtf | 33
+ application/x-dvi | 29
+ application/x-rar | 29
+ application/vnd.ms-excel | 28
+ message/rfc822 | 26
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62271
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 113880640 | 141793694
+ (1 row)
+
+Mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 131346703
+ warc/revisit | 8394443
+ text/xml | 525481
+ application/octet-stream | 502400
+ text/html | 417579
+ unk | 186703
+ application/postscript | 81095
+ application/save | 80915
+ binary/octet-stream | 66698
+ application/x-download | 35771
+ text/plain | 35606
+ image/pdf | 33904
+ application/download | 29701
+ application/force-download | 16726
+ multipart/form-data | 6878
+ application/x-msdownload | 3843
+ application | 3724
+ application/x-octetstream | 3550
+ .pdf | 3138
+ application/x-pdf | 2780
+ application/binary | 1332
+ pdf | 1247
+ file/unknown | 1200
+ application/pdf' | 1192
+ file | 1108
+ application/unknown | 978
+ application/octetstream | 856
+ application/blob | 673
+ text/pdf | 672
+ 0 | 546
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+ total_files | unique_releases
+ -------------+-----------------
+ 105594307 | 19594878
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+----------
+ 200 | 97714631
+ 500 | 7875192
+ -4 | 4772
+ 503 | 520
+ (4 rows)
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 84822508
+ | 12892147
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 31772942
+ pdf | doi | 23528817
+ pdf | doaj | 4264610
+ html | doaj | 2429003
+ pdf | pmc | 2277417
+ pdf | arxiv | 2143549
+ xml | doaj | 9442
+ html | doi | 3022
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 469
+ html | spn | 9
+ (14 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 31772942
+ pdf | doi | fatcat-changelog | 11010764
+ pdf | doi | fatcat-ingest | 9002119
+ pdf | doaj | doaj | 4264610
+ pdf | doi | fatcat-ingest-container | 3515873
+ html | doaj | doaj | 2429003
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1767703
+ pdf | arxiv | fatcat-changelog | 375818
+ pdf | pmc | fatcat-ingest | 211264
+ pdf | pmc | fatcat-changelog | 37328
+ xml | doaj | doaj | 9442
+ html | doi | fatcat-ingest | 3018
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 469
+ pdf | doi | savepapernow-web | 74
+ pdf | arxiv | fatcat-ingest-container | 26
+ html | spn | savepapernow-web | 9
+ html | doi | savepapernow-web | 4
+ pdf | arxiv | savepapernow-web | 2
+ (23 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 168462
+ pdf | oai | 15286
+ pdf | doaj | 2068
+ html | doaj | 620
+ pdf | unpaywall | 13
+ (5 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14163500 | 0.277
+ pdf | mag | 35015357 | 24818176 | 0.709
+ pdf | unpaywall | 31772942 | 25018501 | 0.787
+ pdf | doi | 23529041 | 5773728 | 0.245
+ pdf | doaj | 4264610 | 2851328 | 0.669
+ html | doaj | 2429003 | 122937 | 0.051
+ pdf | pmc | 2277417 | 1736491 | 0.762
+ pdf | arxiv | 2143549 | 2011378 | 0.938
+ xml | doaj | 9442 | 6897 | 0.730
+ html | doi | 3022 | 957 | 0.317
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 469 | 328 | 0.699
+ html | spn | 9 | 2 | 0.222
+ (14 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+--------------------------------+----------
+ pdf | success | 66487928
+ pdf | no-pdf-link | 29279677
+ pdf | no-capture | 22765431
+ pdf | redirect-loop | 9155767
+ pdf | terminal-bad-status | 3549665
+ pdf | link-loop | 2592983
+ html | wrong-scope | 1088793
+ pdf | wrong-mimetype | 792563
+ pdf | gateway-timeout | 478181
+ html | no-capture | 423917
+ pdf | wayback-content-error | 355828
+ pdf | cdx-error | 343862
+ pdf | null-body | 328774
+ pdf | forbidden | 286647
+ pdf | spn2-cdx-lookup-failure | 276769
+ pdf | spn2-wayback-error | 276080
+ pdf | skip-url-blocklist | 265473
+ html | redirect-loop | 212916
+ pdf | not-found | 204367
+ html | unknown-scope | 204112
+ html | html-resource-no-capture | 166034
+ pdf | blocked-cookie | 160336
+ pdf | too-many-redirects | 152984
+ html | success | 123896
+ pdf | wayback-error | 114388
+ html | null-body | 100296
+ pdf | spn2-error:too-many-redirects | 58336
+ html | wayback-content-error | 53926
+ pdf | invalid-host-resolution | 37226
+ pdf | petabox-error | 37177
+ pdf | remote-server-error | 36439
+ pdf | spn2-error | 27556
+ pdf | spn2-error:proxy-error | 25486
+ pdf | read-timeout | 20745
+ html | wrong-mimetype | 18928
+ html | terminal-bad-status | 14059
+ html | petabox-error | 13533
+ pdf | bad-redirect | 7535
+ xml | success | 6897
+ html | cdx-error | 6823
+ pdf | spn2-error:bad-request | 4664
+ pdf | spn2-error:unauthorized | 4391
+ pdf | spn-remote-error | 4206
+ pdf | spn2-error:service-unavailable | 2614
+ pdf | spn2-error:job-failed | 2562
+ xml | null-body | 2353
+ pdf | other-mimetype | 2304
+ pdf | error | 1905
+ html | spn2-cdx-lookup-failure | 1018
+ pdf | redirects-exceeded | 1015
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 36515867
+ pdf | | 22909334
+ pdf | 301 | 7969702
+ html | 200 | 1653303
+ pdf | 503 | 928507
+ pdf | 403 | 823755
+ pdf | 302 | 792842
+ pdf | 400 | 462108
+ html | | 426474
+ pdf | 404 | 422163
+ pdf | 401 | 270611
+ pdf | 500 | 248675
+ html | 301 | 211713
+ pdf | 303 | 109686
+ pdf | 410 | 50648
+ pdf | 502 | 37663
+ pdf | 429 | 31982
+ pdf | 420 | 26603
+ pdf | 509 | 15113
+ pdf | 409 | 14835
+ html | 404 | 9573
+ pdf | 999 | 9296
+ pdf | 307 | 3972
+ pdf | 308 | 3914
+ html | 500 | 3625
+ pdf | 202 | 3515
+ xml | 200 | 2537
+ pdf | 520 | 2072
+ pdf | 206 | 1665
+ pdf | 521 | 1075
+ html | 302 | 1072
+ pdf | 504 | 1000
+ pdf | 412 | 476
+ pdf | 300 | 434
+ pdf | 505 | 429
+ pdf | 406 | 393
+ html | 403 | 382
+ html | 503 | 378
+ pdf | 421 | 298
+ html | 303 | 268
+ pdf | 508 | 195
+ pdf | 226 | 166
+ pdf | 402 | 70
+ html | 502 | 68
+ pdf | 408 | 50
+ pdf | 204 | 34
+ pdf | 416 | 29
+ pdf | 501 | 29
+ pdf | 530 | 27
+ pdf | 507 | 21
+ (50 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but which have no corresponding row in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 8514315 | 6401104
+ (1 row)
diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt
new file mode 100644
index 0000000..a8a9cd5
--- /dev/null
+++ b/sql/stats/2021-04-08_table_sizes.txt
@@ -0,0 +1,40 @@
+
+## SQL Table Sizes
+
+ Size: 467.23G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 26 GB | 76 GB
+ "public"."grobid" | 69 GB | 6834 MB | 75 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."ingest_request" | 39 GB | 32 GB | 70 GB
+ "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB
+ "public"."file_meta" | 32 GB | 21 GB | 53 GB
+ "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ (12 rows)
+
diff --git a/sql/stats/2021-04-12_ingest_domain_summary_30d.txt b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt
new file mode 100644
index 0000000..6811b54
--- /dev/null
+++ b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt
@@ -0,0 +1,345 @@
+ domain | status | count
+---------------------------------------+-------------------------+--------
+ academic.oup.com | | 4105
+ academic.oup.com | spn2-wayback-error | 1393
+ academic.oup.com | link-loop | 1025
+ academic.oup.com | no-pdf-link | 1020
+ academic.oup.com | spn2-cdx-lookup-failure | 512
+ acervus.unicamp.br | | 1967
+ acervus.unicamp.br | no-pdf-link | 1853
+ acp.copernicus.org | | 620
+ acp.copernicus.org | success | 537
+ aip.scitation.org | | 1310
+ aip.scitation.org | blocked-cookie | 1192
+ alustath.uobaghdad.edu.iq | | 697
+ alustath.uobaghdad.edu.iq | success | 550
+ apex.ipk-gatersleben.de | | 1253
+ apex.ipk-gatersleben.de | no-pdf-link | 1132
+ apps.crossref.org | | 4693
+ apps.crossref.org | no-pdf-link | 4075
+ arxiv.org | | 14990
+ arxiv.org | success | 12899
+ arxiv.org | spn2-wayback-error | 1592
+ ashpublications.org | | 563
+ asmedigitalcollection.asme.org | | 3990
+ asmedigitalcollection.asme.org | spn2-cdx-lookup-failure | 1570
+ asmedigitalcollection.asme.org | no-pdf-link | 1449
+ asmedigitalcollection.asme.org | link-loop | 734
+ assets.researchsquare.com | | 8217
+ assets.researchsquare.com | success | 7116
+ assets.researchsquare.com | spn2-wayback-error | 946
+ av.tib.eu | | 526
+ bioone.org | | 588
+ books.openedition.org | | 1784
+ books.openedition.org | no-pdf-link | 1466
+ boris.unibe.ch | | 1420
+ boris.unibe.ch | success | 743
+ brill.com | | 1773
+ brill.com | link-loop | 879
+ chemrxiv.org | | 857
+ chemrxiv.org | no-pdf-link | 519
+ classiques-garnier.com | | 1072
+ classiques-garnier.com | success | 807
+ content.iospress.com | | 793
+ content.iospress.com | link-loop | 568
+ cyberdoi.ru | | 775
+ cyberdoi.ru | redirect-loop | 775
+ cyberleninka.ru | | 1453
+ cyberleninka.ru | success | 1092
+ d197for5662m48.cloudfront.net | | 632
+ d197for5662m48.cloudfront.net | success | 544
+ dergipark.org.tr | | 3070
+ dergipark.org.tr | success | 1251
+ dergipark.org.tr | no-pdf-link | 843
+ dergipark.org.tr | spn2-wayback-error | 677
+ digi.ub.uni-heidelberg.de | | 502
+ dione.lib.unipi.gr | | 783
+ direct.mit.edu | | 996
+ direct.mit.edu | no-pdf-link | 869
+ dl.acm.org | | 1692
+ dl.acm.org | blocked-cookie | 1558
+ dlc.library.columbia.edu | | 4225
+ dlc.library.columbia.edu | no-pdf-link | 2395
+ dlc.library.columbia.edu | spn2-wayback-error | 1568
+ doi.ala.org.au | | 2570
+ doi.ala.org.au | no-pdf-link | 2153
+ doi.nrct.go.th | | 566
+ doi.org | | 10408
+ doi.org | spn2-cdx-lookup-failure | 9593
+ doi.org | terminal-bad-status | 741
+ downloads.hindawi.com | | 2137
+ downloads.hindawi.com | success | 1787
+ dram.journals.ekb.eg | | 541
+ elib.spbstu.ru | | 1243
+ elib.spbstu.ru | redirect-loop | 1214
+ elibrary.vdi-verlag.de | | 1542
+ elibrary.vdi-verlag.de | spn2-wayback-error | 721
+ elifesciences.org | | 689
+ elifesciences.org | success | 521
+ epos.myesr.org | | 705
+ epos.myesr.org | spn2-wayback-error | 604
+ europepmc.org | | 6996
+ europepmc.org | success | 6031
+ europepmc.org | spn2-wayback-error | 756
+ figshare.com | | 1168
+ figshare.com | no-pdf-link | 726
+ files.osf.io | | 1526
+ files.osf.io | success | 1078
+ fjfsdata01prod.blob.core.windows.net | | 5410
+ fjfsdata01prod.blob.core.windows.net | success | 4581
+ fjfsdata01prod.blob.core.windows.net | spn2-wayback-error | 587
+ fldeploc.dep.state.fl.us | | 774
+ fldeploc.dep.state.fl.us | no-pdf-link | 718
+ geoscan.nrcan.gc.ca | | 2056
+ geoscan.nrcan.gc.ca | no-pdf-link | 2019
+ hcommons.org | | 1593
+ hcommons.org | success | 1333
+ hkvalidate.perfdrive.com | | 1322
+ hkvalidate.perfdrive.com | no-pdf-link | 1083
+ ieeexplore.ieee.org | | 20997
+ ieeexplore.ieee.org | too-many-redirects | 15383
+ ieeexplore.ieee.org | spn2-wayback-error | 2555
+ ieeexplore.ieee.org | success | 2165
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 747
+ jamanetwork.com | | 712
+ journals.aps.org | | 1698
+ journals.aps.org | not-found | 1469
+ journals.library.ualberta.ca | | 733
+ journals.library.ualberta.ca | success | 594
+ journals.lww.com | | 6606
+ journals.lww.com | link-loop | 3102
+ journals.lww.com | spn2-wayback-error | 1645
+ journals.lww.com | terminal-bad-status | 965
+ journals.lww.com | spn2-cdx-lookup-failure | 552
+ journals.openedition.org | | 4594
+ journals.openedition.org | success | 1441
+ journals.openedition.org | redirect-loop | 1316
+ journals.openedition.org | spn2-wayback-error | 1197
+ journals.ub.uni-heidelberg.de | | 1039
+ journals.ub.uni-heidelberg.de | success | 728
+ kiss.kstudy.com | | 747
+ kiss.kstudy.com | no-pdf-link | 686
+ library.iated.org | | 1560
+ library.iated.org | redirect-loop | 1148
+ linkinghub.elsevier.com | | 5079
+ linkinghub.elsevier.com | forbidden | 2226
+ linkinghub.elsevier.com | spn2-wayback-error | 1625
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758
+ mr.crossref.org | | 542
+ nsuworks.nova.edu | | 843
+ nsuworks.nova.edu | success | 746
+ ojs.cvut.cz | | 805
+ ojs.cvut.cz | success | 764
+ ojs.ugent.be | | 867
+ ojs.ugent.be | success | 643
+ onepetro.org | | 603
+ onlinelibrary.wiley.com | | 1203
+ onlinelibrary.wiley.com | blocked-cookie | 758
+ open.library.ubc.ca | | 559
+ osf.io | | 3139
+ osf.io | not-found | 2288
+ osf.io | spn2-wayback-error | 582
+ oxford.universitypressscholarship.com | | 3556
+ oxford.universitypressscholarship.com | link-loop | 2373
+ oxford.universitypressscholarship.com | spn2-wayback-error | 562
+ painphysicianjournal.com | | 804
+ painphysicianjournal.com | success | 668
+ papers.ssrn.com | | 6367
+ papers.ssrn.com | link-loop | 3865
+ papers.ssrn.com | spn2-wayback-error | 1106
+ papers.ssrn.com | spn2-cdx-lookup-failure | 1015
+ peerj.com | | 785
+ peerj.com | no-pdf-link | 552
+ pos.sissa.it | | 1455
+ pos.sissa.it | success | 1153
+ preprints.jmir.org | | 763
+ preprints.jmir.org | no-pdf-link | 611
+ psyarxiv.com | | 641
+ psyarxiv.com | no-pdf-link | 546
+ publikationen.uni-tuebingen.de | | 659
+ publons.com | | 6998
+ publons.com | no-pdf-link | 6982
+ pubs.acs.org | | 5860
+ pubs.acs.org | blocked-cookie | 5185
+ pubs.rsc.org | | 2269
+ pubs.rsc.org | link-loop | 1384
+ res.mdpi.com | | 15776
+ res.mdpi.com | success | 13710
+ res.mdpi.com | spn2-wayback-error | 1424
+ res.mdpi.com | spn2-cdx-lookup-failure | 641
+ rrs.scholasticahq.com | | 1078
+ rrs.scholasticahq.com | success | 803
+ rsdjournal.org | | 755
+ rsdjournal.org | success | 524
+ s3-eu-west-1.amazonaws.com | | 3343
+ s3-eu-west-1.amazonaws.com | success | 2893
+ saemobilus.sae.org | | 795
+ saemobilus.sae.org | no-pdf-link | 669
+ sage.figshare.com | | 725
+ scholar.dkyobobook.co.kr | | 1043
+ scholar.dkyobobook.co.kr | no-pdf-link | 915
+ scholarworks.umass.edu | | 1196
+ scholarworks.umass.edu | success | 713
+ secure.jbs.elsevierhealth.com | | 4202
+ secure.jbs.elsevierhealth.com | blocked-cookie | 4169
+ storage.googleapis.com | | 1720
+ storage.googleapis.com | success | 1466
+ tandf.figshare.com | | 789
+ tandf.figshare.com | no-pdf-link | 640
+ tind-customer-agecon.s3.amazonaws.com | | 584
+ turcomat.org | | 1196
+ turcomat.org | spn2-wayback-error | 997
+ unreserved.rba.gov.au | | 823
+ unreserved.rba.gov.au | no-pdf-link | 821
+ utpjournals.press | | 669
+ utpjournals.press | blocked-cookie | 616
+ watermark.silverchair.com | | 3560
+ watermark.silverchair.com | success | 2788
+ watermark.silverchair.com | spn2-wayback-error | 685
+ wayf.switch.ch | | 1169
+ wayf.switch.ch | no-pdf-link | 809
+ www.ahajournals.org | | 802
+ www.ahajournals.org | blocked-cookie | 597
+ www.ajol.info | | 830
+ www.ajol.info | success | 575
+ www.ams.org | | 868
+ www.ams.org | terminal-bad-status | 666
+ www.atlantis-press.com | | 1579
+ www.atlantis-press.com | success | 1071
+ www.bloomsburycollections.com | | 1745
+ www.bloomsburycollections.com | no-pdf-link | 1571
+ www.brazilianjournals.com | | 1385
+ www.brazilianjournals.com | success | 1107
+ www.cairn.info | | 2479
+ www.cairn.info | no-pdf-link | 818
+ www.cairn.info | link-loop | 790
+ www.cambridge.org | | 6801
+ www.cambridge.org | no-pdf-link | 2990
+ www.cambridge.org | spn2-wayback-error | 1475
+ www.cambridge.org | link-loop | 940
+ www.cambridge.org | success | 863
+ www.cureus.com | | 538
+ www.dbpia.co.kr | | 2958
+ www.dbpia.co.kr | redirect-loop | 2953
+ www.degruyter.com | | 58612
+ www.degruyter.com | no-pdf-link | 41065
+ www.degruyter.com | spn2-wayback-error | 7426
+ www.degruyter.com | success | 6628
+ www.degruyter.com | spn2-cdx-lookup-failure | 1624
+ www.degruyter.com | terminal-bad-status | 1565
+ www.dovepress.com | | 869
+ www.dovepress.com | success | 597
+ www.e-manuscripta.ch | | 1047
+ www.e3s-conferences.org | | 817
+ www.e3s-conferences.org | success | 606
+ www.elgaronline.com | | 535
+ www.elibrary.ru | | 1244
+ www.elibrary.ru | no-pdf-link | 1159
+ www.emc2020.eu | | 791
+ www.emc2020.eu | no-pdf-link | 748
+ www.emerald.com | | 2420
+ www.emerald.com | no-pdf-link | 1986
+ www.eurekaselect.com | | 540
+ www.eurosurveillance.org | | 786
+ www.eurosurveillance.org | success | 710
+ www.finersistemas.com | | 1220
+ www.finersistemas.com | success | 1214
+ www.frontiersin.org | | 915
+ www.frontiersin.org | spn2-wayback-error | 602
+ www.hanspub.org | | 618
+ www.humankineticslibrary.com | | 1122
+ www.humankineticslibrary.com | no-pdf-link | 985
+ www.ijcmas.com | | 513
+ www.inderscience.com | | 1532
+ www.inderscience.com | no-pdf-link | 1217
+ www.indianjournals.com | | 904
+ www.ingentaconnect.com | | 885
+ www.ingentaconnect.com | no-pdf-link | 783
+ www.journals.uchicago.edu | | 6055
+ www.journals.uchicago.edu | blocked-cookie | 5927
+ www.journals.vu.lt | | 791
+ www.journals.vu.lt | success | 545
+ www.jstage.jst.go.jp | | 1490
+ www.jstage.jst.go.jp | remote-server-error | 1023
+ www.jstor.org | | 1103
+ www.jstor.org | redirect-loop | 553
+ www.karger.com | | 733
+ www.liebertpub.com | | 804
+ www.liebertpub.com | blocked-cookie | 714
+ www.liverpooluniversitypress.co.uk | | 620
+ www.liverpooluniversitypress.co.uk | too-many-redirects | 529
+ www.mdpi.com | | 3880
+ www.mdpi.com | spn2-wayback-error | 1651
+ www.mdpi.com | forbidden | 1282
+ www.mdpi.com | spn2-cdx-lookup-failure | 714
+ www.nepjol.info | | 596
+ www.nomos-elibrary.de | | 2235
+ www.nomos-elibrary.de | no-pdf-link | 1128
+ www.nomos-elibrary.de | spn2-wayback-error | 559
+ www.oecd-ilibrary.org | | 3046
+ www.oecd-ilibrary.org | no-pdf-link | 2869
+ www.osapublishing.org | | 821
+ www.osapublishing.org | no-pdf-link | 615
+ www.osti.gov | | 1147
+ www.osti.gov | link-loop | 902
+ www.oxfordscholarlyeditions.com | | 759
+ www.oxfordscholarlyeditions.com | no-pdf-link | 719
+ www.preprints.org | | 783
+ www.preprints.org | success | 595
+ www.repository.cam.ac.uk | | 1146
+ www.research-collection.ethz.ch | | 704
+ www.research-collection.ethz.ch | terminal-bad-status | 684
+ www.researchsquare.com | | 853
+ www.researchsquare.com | spn2-wayback-error | 515
+ www.schweizerbart.de | | 730
+ www.schweizerbart.de | no-pdf-link | 653
+ www.scielo.br | | 1777
+ www.scielo.br | success | 1167
+ www.sciencedirect.com | | 14757
+ www.sciencedirect.com | no-pdf-link | 12733
+ www.sciencedirect.com | spn2-wayback-error | 1503
+ www.sciendo.com | | 1955
+ www.sciendo.com | no-pdf-link | 1176
+ www.scilook.eu | | 812
+ www.scilook.eu | success | 563
+ www.scirp.org | | 749
+ www.tandfonline.com | | 11038
+ www.tandfonline.com | blocked-cookie | 9994
+ www.tandfonline.com | no-pdf-link | 663
+ www.taylorfrancis.com | | 71514
+ www.taylorfrancis.com | spn2-wayback-error | 36663
+ www.taylorfrancis.com | no-pdf-link | 15098
+ www.taylorfrancis.com | forbidden | 8699
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 6894
+ www.taylorfrancis.com | link-loop | 3661
+ www.thieme-connect.de | | 3687
+ www.thieme-connect.de | redirect-loop | 1187
+ www.thieme-connect.de | not-found | 945
+ www.thieme-connect.de | no-pdf-link | 941
+ www.worldscientific.com | | 1476
+ www.worldscientific.com | blocked-cookie | 1323
+ www.zora.uzh.ch | | 1118
+ zenodo.org | | 43010
+ zenodo.org | no-pdf-link | 22015
+ zenodo.org | success | 12747
+ zenodo.org | spn2-wayback-error | 4608
+ zenodo.org | spn2-cdx-lookup-failure | 3215
+ | | 725990
+ | no-pdf-link | 209933
+ | success | 206134
+ | spn2-wayback-error | 127015
+ | spn2-cdx-lookup-failure | 53384
+ | blocked-cookie | 35867
+ | link-loop | 25834
+ | too-many-redirects | 16430
+ | redirect-loop | 14648
+ | forbidden | 13794
+ | terminal-bad-status | 8055
+ | not-found | 6399
+ | remote-server-error | 2402
+ | wrong-mimetype | 2011
+ | spn2-error:unauthorized | 912
+ | bad-redirect | 555
+ | read-timeout | 530
+(341 rows)
+
diff --git a/sql/stats/2021-11-01_table_sizes.txt b/sql/stats/2021-11-01_table_sizes.txt
new file mode 100644
index 0000000..57f7e57
--- /dev/null
+++ b/sql/stats/2021-11-01_table_sizes.txt
@@ -0,0 +1,19 @@
+
+Size: 832.66G
+
+ table_name | table_size | indexes_size | total_size
+-------------------------------+------------+--------------+------------
+ "public"."crossref" | 311 GB | 9812 MB | 320 GB
+ "public"."ingest_request" | 44 GB | 40 GB | 84 GB
+ "public"."cdx" | 52 GB | 28 GB | 80 GB
+ "public"."grobid" | 72 GB | 6952 MB | 79 GB
+ "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 34 GB | 21 GB | 54 GB
+ "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+(13 rows)
diff --git a/sql/stats/2021-11-26_stats.txt b/sql/stats/2021-11-26_stats.txt
new file mode 100644
index 0000000..3a0e561
--- /dev/null
+++ b/sql/stats/2021-11-26_stats.txt
@@ -0,0 +1,424 @@
+
+Date: Sat 27 Nov 2021 03:33:30 AM UTC
+
+## SQL Table Sizes
+
+ Size: 937.28G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 393 GB | 10127 MB | 403 GB
+ "public"."ingest_request" | 44 GB | 41 GB | 84 GB
+ "public"."cdx" | 52 GB | 28 GB | 80 GB
+ "public"."grobid" | 72 GB | 6963 MB | 79 GB
+ "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 34 GB | 21 GB | 55 GB
+ "public"."pdf_meta" | 20 GB | 5869 MB | 26 GB
+ "public"."grobid_refs" | 19 GB | 1690 MB | 21 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 179761501 | 244453538203113
+
+ # 179m files, 244 TB
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 179376819
+ application/octet-stream | 155379
+ text/html | 116102
+ application/xml | 42170
+ application/xhtml+xml | 24347
+ text/plain | 15990
+ application/jats+xml | 6899
+ application/gzip | 6491
+ | 6034
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ application/x-bzip2 | 891
+ image/jpeg | 794
+ image/gif | 389
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 303
+ application/x-compress | 272
+ application/zip | 131
+ image/png | 121
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 57
+ application/x-dosexec | 51
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ text/rtf | 33
+ application/x-dvi | 29
+ application/x-rar | 29
+ video/mp4 | 29
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62196
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 119049962 | 149169240
+
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 137271670
+ warc/revisit | 9709493
+ application/octet-stream | 590443
+ text/xml | 525481
+ text/html | 421030
+ unk | 207442
+ application/postscript | 81123
+ application/save | 80988
+ binary/octet-stream | 67476
+ image/pdf | 39419
+ application/x-download | 38278
+ text/plain | 36159
+ application/download | 34328
+ application/force-download | 19729
+ multipart/form-data | 9105
+ application | 5299
+ application/x-msdownload | 3851
+ application/x-octetstream | 3649
+ .pdf | 3318
+ application/x-pdf | 2992
+ pdf | 1484
+ file | 1364
+ application/binary | 1354
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1029
+ application/unknown | 1000
+ 0 | 764
+ text/pdf | 704
+ application/blob | 673
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 111236904
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 102962304
+ 500 | 8269129
+ -4 | 5013
+ 503 | 548
+
+TODO: how many failed, by mimetype? To check whether we are running (or have
+run) non-PDF files through by mistake.
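+
+One possible query for that check (not run here; assumes `grobid` and `file_meta`
+are both keyed on `sha1hex`, like the other tables in this schema):
+
+    SELECT file_meta.mimetype, COUNT(*)
+    FROM grobid
+    -- join back to file_meta to recover the mimetype of the original file
+    LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+    WHERE grobid.status_code != 200
+    GROUP BY file_meta.mimetype
+    ORDER BY COUNT(*) DESC
+    LIMIT 25;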
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.5.5-fatcat | 89983404
+ | 12892161
+ 0.7.0-104-gbeebd9a6b | 86739
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 43701948
+ pdf | unpaywall | 37802895
+ pdf | doi | 28736398
+ pdf | doaj | 4264610
+ html | doaj | 2429003
+ pdf | pmc | 2383398
+ pdf | arxiv | 2330054
+ html | doi | 39725
+ xml | doaj | 9442
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 689
+ html | spn | 48
+ xml | spn | 1
+ (15 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 43701948
+ pdf | unpaywall | unpaywall | 37802895
+ pdf | doi | fatcat-changelog | 16207728
+ pdf | doi | fatcat-ingest | 9012282
+ pdf | doaj | doaj | 4264610
+ pdf | doi | fatcat-ingest-container | 3515873
+ html | doaj | doaj | 2429003
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1767705
+ pdf | arxiv | fatcat-changelog | 562320
+ pdf | pmc | fatcat-ingest | 297527
+ pdf | pmc | fatcat-changelog | 57046
+ html | doi | fatcat-ingest | 37788
+ xml | doaj | doaj | 9442
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ html | doi | fatcat-changelog | 1897
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 689
+ pdf | doi | savepapernow-web | 613
+ html | spn | savepapernow-web | 48
+ html | doi | savepapernow-web | 40
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 3
+ xml | spn | savepapernow-web | 1
+ (25 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 169076
+ pdf | oai | 15283
+ pdf | doaj | 2063
+ html | doaj | 620
+ pdf | doi | 22
+ pdf | unpaywall | 17
+
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14554221 | 0.284
+ pdf | mag | 43701948 | 32643175 | 0.747
+ pdf | unpaywall | 37802895 | 29989257 | 0.793
+ pdf | doi | 28736547 | 7690393 | 0.268
+ pdf | doaj | 4264610 | 2851601 | 0.669
+ html | doaj | 2429003 | 122937 | 0.051
+ pdf | pmc | 2383398 | 1821071 | 0.764
+ pdf | arxiv | 2330054 | 2159738 | 0.927
+ html | doi | 39725 | 1235 | 0.031
+ xml | doaj | 9442 | 6897 | 0.730
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 689 | 503 | 0.730
+ html | spn | 48 | 5 | 0.104
+ xml | spn | 1 | 0 | 0.000
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+
+ ingest_type | status | count
+ -------------+-------------------------------+----------
+ pdf | success | 78944243
+ pdf | no-pdf-link | 26270027
+ pdf | no-capture | 23267156
+ pdf | redirect-loop | 9837466
+ pdf | terminal-bad-status | 4147454
+ pdf | skip-url-blocklist | 3088907
+ pdf | link-loop | 2953891
+ pdf | blocked-cookie | 1855541
+ html | wrong-scope | 1106171
+ pdf | wrong-mimetype | 859941
+ pdf | gateway-timeout | 729771
+ pdf | spn2-cdx-lookup-failure | 584856
+ html | no-capture | 423917
+ pdf | forbidden | 390804
+ pdf | cdx-error | 363091
+ pdf | wayback-content-error | 354894
+ pdf | null-body | 341698
+ pdf | too-many-redirects | 307096
+ pdf | not-found | 294592
+ html | redirect-loop | 213032
+ html | unknown-scope | 207923
+ pdf | spn2-error | 192046
+ html | html-resource-no-capture | 166119
+ html | success | 124177
+ pdf | wayback-error | 105385
+ html | null-body | 100296
+ pdf | spn2-wayback-error | 73176
+ pdf | remote-server-error | 60908
+ pdf | spn2-error:too-many-redirects | 58076
+ pdf | skip-wall | 57744
+ html | wayback-content-error | 53928
+ pdf | read-timeout | 42465
+ pdf | invalid-host-resolution | 37221
+ pdf | petabox-error | 28765
+ pdf | spn2-error:unknown | 23885
+ html | wrong-mimetype | 18930
+ pdf | bad-redirect | 14708
+ html | terminal-bad-status | 14070
+ html | petabox-error | 13770
+ html | spn2-cdx-lookup-failure | 13002
+ pdf | spn2-error:job-failed | 9721
+ html | cdx-error | 7167
+ xml | success | 6897
+ pdf | spn2-error:bad-request | 4433
+ pdf | spn-remote-error | 4206
+ pdf | body-too-large | 3019
+ xml | null-body | 2353
+ pdf | other-mimetype | 2304
+ pdf | error | 1900
+ pdf | spn2-error:proxy-error | 1850
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 36821458
+ pdf | | 26058729
+ pdf | 301 | 8466302
+ html | 200 | 1676730
+ pdf | 503 | 1028504
+ pdf | 302 | 949465
+ pdf | 403 | 936737
+ pdf | 404 | 687661
+ pdf | 400 | 507303
+ html | | 439356
+ pdf | 401 | 288994
+ pdf | 500 | 263775
+ html | 301 | 211796
+ pdf | 303 | 130719
+ pdf | 410 | 66495
+ pdf | 502 | 41760
+ pdf | 429 | 35266
+ pdf | 420 | 26722
+ pdf | 409 | 15204
+ pdf | 509 | 15113
+ pdf | 999 | 11409
+ html | 404 | 9578
+ pdf | 307 | 8404
+ pdf | 308 | 5514
+ pdf | 202 | 4724
+ html | 500 | 3628
+ xml | 200 | 2537
+ pdf | 520 | 2199
+ pdf | 206 | 1694
+ html | 302 | 1138
+ pdf | 504 | 1124
+ pdf | 521 | 1085
+ pdf | 412 | 921
+ pdf | 421 | 714
+ pdf | 300 | 461
+ pdf | 505 | 436
+ pdf | 406 | 427
+ pdf | 508 | 408
+ html | 403 | 382
+ html | 503 | 378
+ html | 303 | 268
+ pdf | 204 | 252
+ pdf | 226 | 166
+ pdf | 402 | 70
+ html | 502 | 68
+ pdf | 523 | 55
+ pdf | 408 | 53
+ pdf | 432 | 45
+ pdf | 530 | 31
+ pdf | 416 | 31
+ (50 rows)
diff --git a/sql/stats/2021-12-02_table_sizes.txt b/sql/stats/2021-12-02_table_sizes.txt
new file mode 100644
index 0000000..b03c370
--- /dev/null
+++ b/sql/stats/2021-12-02_table_sizes.txt
@@ -0,0 +1,22 @@
+
+Size: 940.66G
+
+ table_name | table_size | indexes_size | total_size
+------------------------------------+------------+--------------+------------
+ "public"."crossref" | 394 GB | 10138 MB | 404 GB
+ "public"."ingest_request" | 44 GB | 41 GB | 85 GB
+ "public"."cdx" | 52 GB | 28 GB | 80 GB
+ "public"."grobid" | 72 GB | 6978 MB | 79 GB
+ "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 34 GB | 21 GB | 55 GB
+ "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB
+ "public"."grobid_refs" | 19 GB | 1752 MB | 21 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+(16 rows)
diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt
new file mode 100644
index 0000000..bd20c5c
--- /dev/null
+++ b/sql/stats/2022-04-26_stats.txt
@@ -0,0 +1,432 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 416 GB | 10 GB | 426 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 58 GB | 41 GB | 99 GB
+ "public"."ingest_request" | 50 GB | 48 GB | 98 GB
+ "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 37 GB | 34 GB | 71 GB
+ "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB
+ "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 3015 MB | 31 MB | 3046 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 192402128 | 271919997557597
+ (1 row)
+
+ # 271,919,997,557,597 -> ~272 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 191760695
+ text/html | 330351
+ application/octet-stream | 186696
+ application/xml | 42170
+ application/xhtml+xml | 31470
+ text/plain | 16449
+ application/jats+xml | 6902
+ application/gzip | 6681
+ | 6033
+ application/postscript | 4916
+ image/jpeg | 2901
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 934
+ application/x-bzip2 | 891
+ image/png | 476
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294
+ application/x-compress | 274
+ video/mp4 | 150
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12831
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 130732381 | 162760251
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 149749828
+ warc/revisit | 10437210
+ application/octet-stream | 733161
+ text/html | 642992
+ text/xml | 525483
+ unk | 217642
+ application/postscript | 81127
+ application/save | 81023
+ binary/octet-stream | 67938
+ application/x-download | 41137
+ image/pdf | 39712
+ application/download | 37153
+ text/plain | 36342
+ application/force-download | 21496
+ multipart/form-data | 9792
+ application | 5366
+ application/x-octetstream | 5166
+ application/x-msdownload | 3851
+ .pdf | 3445
+ application/x-pdf | 3018
+ pdf | 1618
+ file | 1370
+ application/binary | 1354
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1047
+ application/unknown | 1001
+ 0 | 773
+ text/pdf | 729
+ application/blob | 673
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 123669603
+ (1 row)
+
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 115668412
+ 500 | 7995428
+ -4 | 5745
+ 503 | 18
+ (4 rows)
+
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 54780825
+ 0.5.5-fatcat | 48003940
+ | 12694404
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | mag | 43701948
+ pdf | doi | 40044585
+ pdf | doaj | 6016771
+ html | doaj | 3648181
+ pdf | arxiv | 2676200
+ pdf | pmc | 2402453
+ html | doi | 41492
+ xml | doaj | 20638
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 829
+ html | spn | 52
+ xml | doi | 1
+ xml | spn | 1
+ (16 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 20936949
+ pdf | doi | fatcat-ingest | 15590201
+ pdf | doaj | doaj | 6016771
+ html | doaj | doaj | 3648181
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 691405
+ pdf | pmc | fatcat-ingest | 297646
+ pdf | pmc | fatcat-changelog | 75982
+ html | doi | fatcat-ingest | 37904
+ xml | doaj | doaj | 20638
+ html | doi | fatcat-changelog | 3534
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1562
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 829
+ html | doi | savepapernow-web | 54
+ html | spn | savepapernow-web | 52
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 3
+ xml | doi | savepapernow-web | 1
+ xml | spn | savepapernow-web | 1
+ (26 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doaj | 1619621
+ html | doaj | 1208412
+ pdf | mag | 167653
+ pdf | oai | 15282
+ xml | doaj | 11196
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (7 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 15968290 | 0.312
+ pdf | unpaywall | 43932525 | 32618045 | 0.742
+ pdf | mag | 43701948 | 32662926 | 0.747
+ pdf | doi | 40044738 | 10925369 | 0.273
+ pdf | doaj | 6016771 | 3042569 | 0.506
+ html | doaj | 3648181 | 344208 | 0.094
+ pdf | arxiv | 2676206 | 2269708 | 0.848
+ pdf | pmc | 2402453 | 1855679 | 0.772
+ html | doi | 41492 | 1739 | 0.042
+ xml | doaj | 20638 | 6899 | 0.334
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 829 | 616 | 0.743
+ html | spn | 52 | 7 | 0.135
+ xml | doi | 1 | 0 | 0.000
+ xml | spn | 1 | 0 | 0.000
+ (16 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+---------------------------------+----------
+ pdf | success | 85709322
+ pdf | no-pdf-link | 29713304
+ pdf | no-capture | 26632191
+ pdf | redirect-loop | 10979145
+ pdf | terminal-bad-status | 4977000
+ pdf | link-loop | 3434877
+ pdf | skip-url-blocklist | 3114258
+ pdf | blocked-cookie | 2156835
+ html | wrong-scope | 1126911
+ pdf | wrong-mimetype | 980546
+ pdf | gateway-timeout | 651562
+ pdf | spn2-cdx-lookup-failure | 484016
+ pdf | spn2-backoff | 399382
+ pdf | cdx-error | 373964
+ pdf | wayback-content-error | 354370
+ html | success | 345860
+ pdf | null-body | 336182
+ pdf | spn2-error:500 | 309755
+ pdf | forbidden | 291175
+ pdf | not-found | 275560
+ pdf | too-many-redirects | 262312
+ html | unknown-scope | 230352
+ html | redirect-loop | 226596
+ html | html-resource-no-capture | 205646
+ html | no-capture | 164014
+ component | spn2-cdx-lookup-failure | 148825
+ component | wrong-mimetype | 130344
+ html | null-body | 100296
+ pdf | wayback-error | 94286
+ pdf | spn2-wayback-error | 81365
+ component | no-capture | 75278
+ pdf | spn2-error | 69830
+ pdf | skip-wall | 57744
+ pdf | spn2-error:too-many-redirects | 53808
+ pdf | remote-server-error | 41286
+ pdf | petabox-error | 38800
+ pdf | invalid-host-resolution | 37337
+ pdf | read-timeout | 36872
+ component | spn2-backoff | 33217
+ pdf | empty-blob | 27946
+ component | spn2-error | 24078
+ pdf | spn2-error:unknown | 23697
+ component | gateway-timeout | 23139
+ html | wrong-mimetype | 22731
+ html | wayback-content-error | 20507
+ pdf | spn2-error:host-crawling-paused | 19900
+ pdf | bad-redirect | 19183
+ html | terminal-bad-status | 13354
+ component | blocked-cookie | 12287
+ component | spn2-error:500 | 11271
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 38144779
+ pdf | | 32762240
+ pdf | 301 | 9433087
+ html | 200 | 1716127
+ pdf | 403 | 1416632
+ pdf | 302 | 1134668
+ pdf | 404 | 888853
+ pdf | 401 | 746311
+ pdf | 503 | 655894
+ pdf | 400 | 531479
+ component | | 337603
+ pdf | 500 | 247944
+ html | 301 | 224237
+ html | | 167194
+ pdf | 303 | 135048
+ component | 200 | 130663
+ pdf | 429 | 93489
+ pdf | 410 | 67392
+ pdf | 420 | 26722
+ pdf | 502 | 18770
+ pdf | 409 | 15152
+ pdf | 509 | 15113
+ pdf | 999 | 11747
+ html | 404 | 9879
+ pdf | 307 | 8895
+ pdf | 412 | 7053
+ pdf | 308 | 6627
+ pdf | 202 | 5289
+ xml | 200 | 2540
+ html | 500 | 2480
+ pdf | 520 | 2220
+ pdf | 521 | 1844
+ pdf | 206 | 1739
+ html | 302 | 1407
+ pdf | 504 | 1146
+ html | 303 | 1123
+ pdf | 421 | 986
+ pdf | 406 | 938
+ pdf | 204 | 498
+ pdf | 505 | 468
+ pdf | 300 | 436
+ pdf | 508 | 422
+ pdf | 426 | 405
+ html | 429 | 402
+ html | 403 | 398
+ pdf | 432 | 366
+ component | 301 | 294
+ pdf | 405 | 210
+ pdf | 226 | 166
+ component | 302 | 128
+ (50 rows)
+
diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt
new file mode 100644
index 0000000..864abd4
--- /dev/null
+++ b/sql/stats/2022-04-27_crawl_changelog.txt
@@ -0,0 +1,191 @@
+ domain | status | count
+--------------------------------------+-------------------------+--------
+ academic.oup.com | | 1243
+ academic.oup.com | spn2-cdx-lookup-failure | 990
+ aip.scitation.org | | 313
+ aip.scitation.org | spn2-cdx-lookup-failure | 224
+ ajps.uomustansiriyah.edu.iq | | 235
+ apps.crossref.org | | 1329
+ apps.crossref.org | spn2-cdx-lookup-failure | 942
+ apps.crossref.org | no-pdf-link | 387
+ archaeologydataservice.ac.uk | | 422
+ archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289
+ arxiv.org | | 3512
+ arxiv.org | spn2-cdx-lookup-failure | 2319
+ arxiv.org | success | 1177
+ assets.researchsquare.com | | 571
+ assets.researchsquare.com | spn2-cdx-lookup-failure | 322
+ assets.researchsquare.com | success | 249
+ brill.com | | 397
+ brill.com | spn2-cdx-lookup-failure | 265
+ cla.berkeley.edu | | 239
+ classiques-garnier.com | | 249
+ cyberleninka.ru | | 340
+ cyberleninka.ru | spn2-cdx-lookup-failure | 244
+ dergipark.org.tr | | 468
+ dergipark.org.tr | spn2-cdx-lookup-failure | 333
+ dl.acm.org | | 592
+ dl.acm.org | spn2-cdx-lookup-failure | 470
+ doi.ala.org.au | | 288
+ doi.ala.org.au | spn2-cdx-lookup-failure | 220
+ doi.org | | 1107
+ doi.org | terminal-bad-status | 679
+ doi.org | spn2-cdx-lookup-failure | 415
+ downloads.hindawi.com | | 279
+ downloads.hindawi.com | success | 267
+ edbs.uomustansiriyah.edu.iq | | 294
+ edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209
+ elibrary.kdpu.edu.ua | | 320
+ elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233
+ elibrary.ru | | 722
+ elibrary.ru | spn2-cdx-lookup-failure | 505
+ europepmc.org | | 986
+ europepmc.org | spn2-cdx-lookup-failure | 681
+ europepmc.org | success | 291
+ figshare.com | | 377
+ figshare.com | spn2-cdx-lookup-failure | 328
+ fjfsdata01prod.blob.core.windows.net | | 255
+ fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216
+ hammer.purdue.edu | | 224
+ ieeexplore.ieee.org | | 3904
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654
+ ieeexplore.ieee.org | gateway-timeout | 792
+ ieeexplore.ieee.org | spn2-backoff | 419
+ journals.eco-vector.com | | 428
+ journals.eco-vector.com | spn2-cdx-lookup-failure | 306
+ journals.lww.com | | 727
+ journals.lww.com | spn2-cdx-lookup-failure | 622
+ journals.openedition.org | | 806
+ journals.openedition.org | spn2-cdx-lookup-failure | 554
+ journals.plos.org | | 348
+ journals.plos.org | spn2-cdx-lookup-failure | 244
+ kiss.kstudy.com | | 226
+ kluwerlawonline.com | | 723
+ kluwerlawonline.com | spn2-cdx-lookup-failure | 489
+ kluwerlawonline.com | link-loop | 203
+ linkinghub.elsevier.com | | 401
+ linkinghub.elsevier.com | spn2-backoff | 342
+ mdpi-res.com | | 1463
+ mdpi-res.com | success | 1337
+ muse.jhu.edu | | 346
+ muse.jhu.edu | spn2-cdx-lookup-failure | 253
+ onepetro.org | | 363
+ onepetro.org | spn2-cdx-lookup-failure | 284
+ online.ucpress.edu | | 1620
+ online.ucpress.edu | spn2-cdx-lookup-failure | 1511
+ onlinelibrary.wiley.com | | 2913
+ onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109
+ onlinelibrary.wiley.com | terminal-bad-status | 787
+ opendata.uni-halle.de | | 519
+ opendata.uni-halle.de | spn2-cdx-lookup-failure | 343
+ osf.io | | 1554
+ osf.io | spn2-cdx-lookup-failure | 1350
+ papers.ssrn.com | | 2207
+ papers.ssrn.com | spn2-cdx-lookup-failure | 1727
+ papers.ssrn.com | link-loop | 457
+ psycharchives.org | | 384
+ psycharchives.org | spn2-cdx-lookup-failure | 283
+ publons.com | | 493
+ publons.com | spn2-cdx-lookup-failure | 348
+ pubs.acs.org | | 1240
+ pubs.acs.org | spn2-cdx-lookup-failure | 881
+ pubs.acs.org | terminal-bad-status | 298
+ pubs.rsc.org | | 603
+ pubs.rsc.org | spn2-cdx-lookup-failure | 460
+ repositories.lib.utexas.edu | | 1861
+ repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288
+ repositories.lib.utexas.edu | terminal-bad-status | 523
+ s3-eu-west-1.amazonaws.com | | 216
+ sage.figshare.com | | 374
+ sage.figshare.com | spn2-cdx-lookup-failure | 309
+ scholar.dkyobobook.co.kr | | 220
+ scholarworks.gsu.edu | | 749
+ scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577
+ tandf.figshare.com | | 214
+ www.atlantis-press.com | | 338
+ www.atlantis-press.com | spn2-cdx-lookup-failure | 214
+ www.cairn.info | | 782
+ www.cairn.info | spn2-cdx-lookup-failure | 541
+ www.cambridge.org | | 2325
+ www.cambridge.org | spn2-cdx-lookup-failure | 1787
+ www.cambridge.org | no-pdf-link | 300
+ www.cell.com | | 213
+ www.concrete.org | | 476
+ www.concrete.org | spn2-cdx-lookup-failure | 340
+ www.dbpia.co.kr | | 375
+ www.dbpia.co.kr | spn2-cdx-lookup-failure | 275
+ www.degruyter.com | | 3849
+ www.degruyter.com | spn2-cdx-lookup-failure | 2969
+ www.degruyter.com | no-pdf-link | 712
+ www.dib.ie | | 1100
+ www.dib.ie | spn2-cdx-lookup-failure | 1038
+ www.e-periodica.ch | | 821
+ www.e-periodica.ch | spn2-cdx-lookup-failure | 620
+ www.e-periodica.ch | no-pdf-link | 201
+ www.elibrary.ru | | 401
+ www.elibrary.ru | spn2-cdx-lookup-failure | 281
+ www.emerald.com | | 390
+ www.emerald.com | spn2-cdx-lookup-failure | 275
+ www.eurekaselect.com | | 275
+ www.frontiersin.org | | 1266
+ www.frontiersin.org | spn2-cdx-lookup-failure | 1025
+ www.hanspub.org | | 229
+ www.hindawi.com | | 604
+ www.hindawi.com | spn2-cdx-lookup-failure | 594
+ www.inderscience.com | | 201
+ www.jstage.jst.go.jp | | 1094
+ www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807
+ www.jstage.jst.go.jp | success | 206
+ www.mdpi.com | | 4340
+ www.mdpi.com | spn2-cdx-lookup-failure | 4258
+ www.nomos-elibrary.de | | 2749
+ www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909
+ www.nomos-elibrary.de | redirect-loop | 819
+ www.osti.gov | | 275
+ www.oxfordhandbooks.com | | 248
+ www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224
+ www.pdcnet.org | | 217
+ www.researchsquare.com | | 483
+ www.researchsquare.com | spn2-cdx-lookup-failure | 317
+ www.scielo.br | | 319
+ www.scielo.br | spn2-cdx-lookup-failure | 222
+ www.sciencedirect.com | | 3384
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 3267
+ www.spiedigitallibrary.org | | 441
+ www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327
+ www.tandfonline.com | | 2401
+ www.tandfonline.com | spn2-cdx-lookup-failure | 1552
+ www.tandfonline.com | no-pdf-link | 303
+ www.tandfonline.com | blocked-cookie | 250
+ www.taylorfrancis.com | | 1232
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 908
+ www.thieme-connect.de | | 520
+ www.thieme-connect.de | spn2-cdx-lookup-failure | 366
+ www.worldscientific.com | | 383
+ www.worldscientific.com | spn2-cdx-lookup-failure | 276
+ zenodo.org | | 10625
+ zenodo.org | spn2-cdx-lookup-failure | 7777
+ zenodo.org | success | 1574
+ zenodo.org | no-pdf-link | 1160
+ zivahub.uct.ac.za | | 3428
+ zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845
+ zivahub.uct.ac.za | no-pdf-link | 583
+ | | 130491
+ | spn2-cdx-lookup-failure | 95169
+ | success | 13354
+ | no-pdf-link | 9621
+ | terminal-bad-status | 3385
+ | spn2-backoff | 2396
+ | redirect-loop | 2216
+ | link-loop | 1850
+ | gateway-timeout | 1061
+ | spn2-error:blocked-url | 428
+ | blocked-cookie | 415
+ | spn2-error | 246
+(182 rows)
+
+----
+
+The overwhelming issue is `spn2-cdx-lookup-failure`. Should check in again after
+a week or two, once crawling and retries are running smoothly, and see what
+things look like then.
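+
+The query behind the table above isn't recorded in this file. A sketch of one way
+to regenerate this kind of domain/status breakdown (not the exact query; assumes
+the domain comes from `ingest_file_result.terminal_url`, that results have an
+`updated` timestamp, and that daily crawl results are selected by request source):
+
+    SELECT domain, status, COUNT(*)
+    FROM (
+        SELECT
+            -- hostname extracted from the final (terminal) URL
+            SUBSTRING(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)/?') AS domain,
+            ingest_file_result.status
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.base_url = ingest_request.base_url
+            AND ingest_file_result.ingest_type = ingest_request.ingest_type
+        WHERE ingest_request.ingest_request_source = 'fatcat-changelog'
+            AND ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
+    ) AS recent
+    -- GROUPING SETS add the per-domain, per-status, and overall total rows
+    GROUP BY GROUPING SETS ((domain, status), (domain), (status), ())
+    HAVING COUNT(*) > 200
+    ORDER BY domain NULLS LAST, COUNT(*) DESC;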
diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt
new file mode 100644
index 0000000..8d98217
--- /dev/null
+++ b/sql/stats/2022-05-11_crawl_changelog.txt
@@ -0,0 +1,410 @@
+ domain | status | count
+-----------------------------------------------------------------+-------------------------+--------
+ academic.oup.com | | 2210
+ academic.oup.com | no-pdf-link | 1350
+ academic.oup.com | bad-redirect | 510
+ academiccommons.columbia.edu | | 379
+ academiccommons.columbia.edu | success | 339
+ aip.scitation.org | | 762
+ aip.scitation.org | terminal-bad-status | 430
+ apps.crossref.org | | 9894
+ apps.crossref.org | no-pdf-link | 9886
+ apps.euskadi.eus | | 242
+ apps.euskadi.eus | no-pdf-link | 240
+ arxiv.org | | 44889
+ arxiv.org | success | 28781
+ arxiv.org | spn2-backoff | 7975
+ arxiv.org | terminal-bad-status | 4508
+ arxiv.org | spn2-cdx-lookup-failure | 2010
+ arxiv.org | redirect-loop | 619
+ arxiv.org | no-pdf-link | 242
+ arxiv.org | spn2-error | 236
+ asa.scitation.org | | 356
+ asa.scitation.org | terminal-bad-status | 299
+ asmedigitalcollection.asme.org | | 240
+ assets.cureus.com | | 336
+ assets.cureus.com | success | 335
+ assets.researchsquare.com | | 1042
+ assets.researchsquare.com | success | 993
+ av.tib.eu | | 205
+ av.tib.eu | no-pdf-link | 203
+ bibliographie.uni-tuebingen.de | | 213
+ bibliographie.uni-tuebingen.de | no-pdf-link | 211
+ biorxiv.org | redirect-loop | 217
+ biorxiv.org | | 217
+ books.openedition.org | | 691
+ books.openedition.org | no-pdf-link | 687
+ boris.unibe.ch | | 525
+ boris.unibe.ch | success | 466
+ bridges.monash.edu | | 663
+ bridges.monash.edu | no-pdf-link | 647
+ brill.com | | 860
+ brill.com | success | 434
+ chemrxiv.org | | 201
+ classiques-garnier.com | | 242
+ content.iospress.com | | 325
+ content.iospress.com | link-loop | 247
+ core.tdar.org | | 216
+ core.tdar.org | no-pdf-link | 211
+ cyberleninka.ru | | 646
+ cyberleninka.ru | success | 620
+ d197for5662m48.cloudfront.net | | 263
+ d197for5662m48.cloudfront.net | success | 262
+ dergipark.org.tr | | 891
+ dergipark.org.tr | success | 526
+ dergipark.org.tr | no-pdf-link | 261
+ digi.ub.uni-heidelberg.de | | 427
+ digi.ub.uni-heidelberg.de | no-pdf-link | 427
+ direct.mit.edu | | 268
+ direct.mit.edu | no-pdf-link | 208
+ dl.acm.org | | 1719
+ dl.acm.org | success | 829
+ dl.acm.org | no-pdf-link | 546
+ dl.acm.org | terminal-bad-status | 205
+ dlc.library.columbia.edu | | 385
+ dlc.library.columbia.edu | terminal-bad-status | 319
+ doi.ala.org.au | | 724
+ doi.ala.org.au | no-pdf-link | 721
+ doi.apa.org | | 214
+ doi.org | | 3390
+ doi.org | terminal-bad-status | 2938
+ doi.org | redirect-loop | 233
+ doi.org | spn2-wayback-error | 208
+ doi.usp.org | | 325
+ doi.usp.org | no-pdf-link | 324
+ downloads.hindawi.com | | 1439
+ downloads.hindawi.com | success | 1436
+ du.diva-portal.org | | 589
+ du.diva-portal.org | success | 586
+ econtents.bc.unicamp.br | | 310
+ econtents.bc.unicamp.br | success | 310
+ ediss.uni-goettingen.de | | 728
+ ediss.uni-goettingen.de | success | 425
+ elibrary.kdpu.edu.ua | | 907
+ elibrary.kdpu.edu.ua | bad-redirect | 712
+ elibrary.ru | | 925
+ elibrary.ru | terminal-bad-status | 492
+ elibrary.ru | bad-redirect | 230
+ elibrary.vdi-verlag.de | | 393
+ elifesciences.org | | 296
+ elifesciences.org | success | 276
+ europepmc.org | | 3024
+ europepmc.org | success | 2541
+ europepmc.org | terminal-bad-status | 463
+ figshare.com | | 493
+ figshare.com | no-pdf-link | 440
+ files.osf.io | | 883
+ files.osf.io | success | 686
+ fjfsdata01prod.blob.core.windows.net | | 3869
+ fjfsdata01prod.blob.core.windows.net | success | 3818
+ ieeexplore.ieee.org | | 10854
+ ieeexplore.ieee.org | gateway-timeout | 5495
+ ieeexplore.ieee.org | spn2-backoff | 1662
+ ieeexplore.ieee.org | no-pdf-link | 1417
+ ieeexplore.ieee.org | success | 1410
+ ieeexplore.ieee.org | redirect-loop | 768
+ iiif.crossasia.org | | 7608
+ iiif.crossasia.org | no-pdf-link | 7568
+ ikee.lib.auth.gr | | 450
+ ikee.lib.auth.gr | success | 332
+ ins.journals.ekb.eg | | 212
+ iopscience.iop.org | | 268
+ jamanetwork.com | | 333
+ journals.aps.org | | 414
+ journals.asm.org | | 242
+ journals.flvc.org | | 245
+ journals.flvc.org | success | 242
+ journals.healio.com | | 755
+ journals.healio.com | terminal-bad-status | 668
+ journals.lincoln.ac.nz | | 244
+ journals.lincoln.ac.nz | success | 239
+ journals.lww.com | | 1772
+ journals.lww.com | link-loop | 1425
+ journals.lww.com | spn2-backoff | 209
+ journals.openedition.org | | 1192
+ journals.openedition.org | redirect-loop | 467
+ journals.openedition.org | success | 451
+ journals.plos.org | | 771
+ journals.plos.org | success | 750
+ journals.ub.uni-heidelberg.de | | 787
+ journals.ub.uni-heidelberg.de | success | 741
+ kazanmedjournal.ru | | 240
+ kazanmedjournal.ru | success | 231
+ kiss.kstudy.com | | 219
+ kiss.kstudy.com | no-pdf-link | 218
+ kluwerlawonline.com | | 444
+ kluwerlawonline.com | link-loop | 402
+ libraetd.lib.virginia.edu | | 362
+ libraetd.lib.virginia.edu | no-pdf-link | 361
+ link.springer.com | | 305
+ linkinghub.elsevier.com | | 568
+ linkinghub.elsevier.com | spn2-backoff | 545
+ ltu-figshare-repo.s3.aarnet.edu.au | | 269
+ ltu-figshare-repo.s3.aarnet.edu.au | success | 268
+ mausamjournal.imd.gov.in | | 202
+ mdpi-res.com | | 8892
+ mdpi-res.com | success | 8863
+ mededpublish.org | | 1900
+ mededpublish.org | no-pdf-link | 1900
+ meetingorganizer.copernicus.org | | 276
+ meetingorganizer.copernicus.org | no-pdf-link | 271
+ muse.jhu.edu | | 1047
+ muse.jhu.edu | terminal-bad-status | 755
+ muse.jhu.edu | link-loop | 203
+ online.ucpress.edu | | 358
+ online.ucpress.edu | link-loop | 212
+ onlinelibrary.wiley.com | | 5813
+ onlinelibrary.wiley.com | terminal-bad-status | 4587
+ onlinelibrary.wiley.com | spn2-wayback-error | 614
+ onlinelibrary.wiley.com | blocked-cookie | 381
+ open.library.ubc.ca | | 206
+ opendata.uni-halle.de | | 1768
+ opendata.uni-halle.de | success | 1215
+ opendata.uni-halle.de | wrong-mimetype | 260
+ opendata2.uni-halle.de | | 206
+ opg.optica.org | | 205
+ osf.io | | 2949
+ osf.io | no-pdf-link | 2404
+ osf.io | spn2-backoff | 299
+ papers.ssrn.com | | 3962
+ papers.ssrn.com | link-loop | 3800
+ peerj.com | | 273
+ preprints.jmir.org | | 275
+ preprints.jmir.org | cdx-error | 255
+ publikationen.bibliothek.kit.edu | | 213
+ publons.com | | 593
+ publons.com | no-pdf-link | 590
+ pubs.acs.org | | 2288
+ pubs.acs.org | terminal-bad-status | 1841
+ pubs.acs.org | spn2-wayback-error | 210
+ pubs.rsc.org | | 1698
+ pubs.rsc.org | bad-redirect | 811
+ pubs.rsc.org | link-loop | 352
+ pubs.rsc.org | success | 307
+ radiopaedia.org | | 220
+ read.dukeupress.edu | | 303
+ repositories.lib.utexas.edu | | 1570
+ repositories.lib.utexas.edu | bad-redirect | 513
+ repositories.lib.utexas.edu | spn2-backoff | 383
+ repositories.lib.utexas.edu | gateway-timeout | 379
+ repositories.lib.utexas.edu | terminal-bad-status | 282
+ repository.uj.ac.za | | 489
+ repository.uj.ac.za | no-pdf-link | 365
+ repository.unsworks.unsw.edu.au | | 397
+ repository.urosario.edu.co | | 2429
+ repository.urosario.edu.co | success | 1648
+ repository.urosario.edu.co | bad-redirect | 613
+ rex.libraries.wsu.edu | no-pdf-link | 241
+ rex.libraries.wsu.edu | | 241
+ rsdjournal.org | | 208
+ rsdjournal.org | success | 208
+ s3-ap-southeast-2.amazonaws.com | | 282
+ s3-ap-southeast-2.amazonaws.com | success | 277
+ s3-eu-west-1.amazonaws.com | | 4615
+ s3-eu-west-1.amazonaws.com | success | 4593
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237
+ sage.figshare.com | | 415
+ sage.figshare.com | no-pdf-link | 385
+ scholar.dkyobobook.co.kr | | 512
+ scholar.dkyobobook.co.kr | no-pdf-link | 509
+ scholarlypublishingcollective.org | | 287
+ scholarworks.gsu.edu | | 1132
+ scholarworks.gsu.edu | success | 1000
+ scholarworks.iupui.edu | | 205
+ scholarworks.umass.edu | | 417
+ scholarworks.umass.edu | success | 400
+ sciencescholar.us | | 404
+ secure.jbs.elsevierhealth.com | | 727
+ secure.jbs.elsevierhealth.com | terminal-bad-status | 722
+ tandf.figshare.com | | 354
+ tandf.figshare.com | no-pdf-link | 342
+ unsworks.unsw.edu.au | | 408
+ unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342
+ valep.vc.univie.ac.at | no-pdf-link | 737
+ valep.vc.univie.ac.at | | 737
+ watermark.silverchair.com | | 1604
+ watermark.silverchair.com | success | 1598
+ wayf.switch.ch | | 215
+ wayf.switch.ch | no-pdf-link | 213
+ www.ahajournals.org | | 438
+ www.ahajournals.org | no-pdf-link | 306
+ www.ahbps.org | | 316
+ www.ahbps.org | success | 312
+ www.atenaeditora.com.br | | 390
+ www.atenaeditora.com.br | terminal-bad-status | 333
+ www.atlantis-press.com | | 914
+ www.atlantis-press.com | success | 901
+ www.atsjournals.org | | 1245
+ www.atsjournals.org | success | 1189
+ www.biorxiv.org | | 712
+ www.biorxiv.org | success | 670
+ www.bloomsburycollections.com | | 982
+ www.bloomsburycollections.com | terminal-bad-status | 566
+ www.cahiers-clsl.ch | | 305
+ www.cahiers-clsl.ch | success | 298
+ www.cairn.info | | 1799
+ www.cairn.info | no-pdf-link | 662
+ www.cairn.info | link-loop | 487
+ www.cairn.info | success | 355
+ www.cairn.info | terminal-bad-status | 267
+ www.cambridge.org | | 3258
+ www.cambridge.org | no-pdf-link | 1682
+ www.cambridge.org | success | 682
+ www.cambridge.org | bad-redirect | 404
+ www.cambridge.org | link-loop | 302
+ www.dbpia.co.kr | | 763
+ www.dbpia.co.kr | no-pdf-link | 443
+ www.dbpia.co.kr | redirect-loop | 287
+ www.degruyter.com | | 12655
+ www.degruyter.com | no-pdf-link | 9112
+ www.degruyter.com | success | 2898
+ www.degruyter.com | spn2-backoff | 507
+ www.dib.ie | | 1381
+ www.dib.ie | no-pdf-link | 1378
+ www.dovepress.com | | 231
+ www.dovepress.com | success | 216
+ www.e-manuscripta.ch | | 767
+ www.e-manuscripta.ch | success | 399
+ www.e-periodica.ch | | 1406
+ www.e-periodica.ch | no-pdf-link | 1402
+ www.e-rara.ch | no-pdf-link | 251
+ www.e-rara.ch | | 251
+ www.editoracientifica.org | no-pdf-link | 205
+ www.editoracientifica.org | | 205
+ www.elgaronline.com | | 427
+ www.elibrary.ru | | 616
+ www.elibrary.ru | terminal-bad-status | 364
+ www.elibrary.ru | no-pdf-link | 216
+ www.emerald.com | | 862
+ www.emerald.com | no-pdf-link | 724
+ www.endocrine-abstracts.org | | 1907
+ www.endocrine-abstracts.org | no-pdf-link | 1905
+ www.eurekaselect.com | | 285
+ www.eurekaselect.com | link-loop | 246
+ www.even3.com.br | | 233
+ www.frontiersin.org | | 585
+ www.frontiersin.org | spn2-backoff | 436
+ www.humankineticslibrary.com | no-pdf-link | 207
+ www.humankineticslibrary.com | | 207
+ www.igi-global.com | | 1600
+ www.igi-global.com | no-pdf-link | 1199
+ www.igi-global.com | bad-redirect | 258
+ www.inderscience.com | | 385
+ www.inderscience.com | no-pdf-link | 365
+ www.inderscienceonline.com | | 202
+ www.ingentaconnect.com | | 450
+ www.ingentaconnect.com | no-pdf-link | 260
+ www.jstage.jst.go.jp | | 1248
+ www.jstage.jst.go.jp | success | 870
+ www.karger.com | | 313
+ www.liebertpub.com | | 271
+ www.liebertpub.com | no-pdf-link | 241
+ www.nicecjournal.co.uk | | 274
+ www.nicecjournal.co.uk | success | 274
+ www.nomos-elibrary.de | | 1771
+ www.nomos-elibrary.de | no-pdf-link | 788
+ www.nomos-elibrary.de | redirect-loop | 506
+ www.nomos-elibrary.de | bad-redirect | 207
+ www.osti.gov | | 381
+ www.osti.gov | link-loop | 326
+ www.persee.fr | | 277
+ www.preprints.org | | 225
+ www.preprints.org | success | 225
+ www.protocols.io | | 770
+ www.protocols.io | success | 485
+ www.repository.cam.ac.uk | | 510
+ www.repository.cam.ac.uk | bad-redirect | 213
+ www.research-collection.ethz.ch | | 416
+ www.research-collection.ethz.ch | bad-redirect | 249
+ www.researchsquare.com | | 1121
+ www.researchsquare.com | bad-redirect | 985
+ www.scielo.br | | 828
+ www.scielo.br | success | 641
+ www.sciencedirect.com | | 8567
+ www.sciencedirect.com | terminal-bad-status | 5773
+ www.sciencedirect.com | spn2-wayback-error | 1590
+ www.sciencedirect.com | no-pdf-link | 576
+ www.sciencedirect.com | spn2-backoff | 479
+ www.sciendo.com | | 257
+ www.sciendo.com | success | 222
+ www.scitepress.org | | 381
+ www.scitepress.org | no-pdf-link | 377
+ www.spiedigitallibrary.org | | 1061
+ www.spiedigitallibrary.org | bad-redirect | 571
+ www.spiedigitallibrary.org | gateway-timeout | 233
+ www.tandfonline.com | | 4934
+ www.tandfonline.com | no-pdf-link | 2088
+ www.tandfonline.com | terminal-bad-status | 1282
+ www.tandfonline.com | blocked-cookie | 757
+ www.tandfonline.com | redirect-loop | 488
+ www.tandfonline.com | spn2-wayback-error | 202
+ www.taylorfrancis.com | | 3979
+ www.taylorfrancis.com | link-loop | 1928
+ www.taylorfrancis.com | no-pdf-link | 1840
+ www.techniques-ingenieur.fr | | 354
+ www.techniques-ingenieur.fr | no-pdf-link | 353
+ www.thieme-connect.de | | 1987
+ www.thieme-connect.de | no-pdf-link | 949
+ www.thieme-connect.de | link-loop | 869
+ www.tib.eu | no-pdf-link | 315
+ www.tib.eu | | 315
+ www.un-ilibrary.org | no-pdf-link | 352
+ www.un-ilibrary.org | | 352
+ www.worldscientific.com | | 668
+ www.worldscientific.com | no-pdf-link | 629
+ www.zora.uzh.ch | | 318
+ zenodo.org | | 46585
+ zenodo.org | no-pdf-link | 29519
+ zenodo.org | success | 14768
+ zenodo.org | terminal-bad-status | 810
+ zenodo.org | wrong-mimetype | 691
+ zenodo.org | spn2-cdx-lookup-failure | 395
+ zenodo.org | spn2-backoff | 294
+ zivahub.uct.ac.za | | 1909
+ zivahub.uct.ac.za | no-pdf-link | 1880
+ zop.zb.uzh.ch | | 228
+ zop.zb.uzh.ch | success | 217
+ | | 365582
+ | success | 141497 38.7%
+ | no-pdf-link | 120852 33.0%
+ | terminal-bad-status | 31900 8.7%
+ | spn2-backoff | 16979 4.6%
+ | link-loop | 13624 3.7%
+ | bad-redirect | 8736
+ | redirect-loop | 7405
+ | gateway-timeout | 6997
+ | spn2-cdx-lookup-failure | 5146
+ | spn2-wayback-error | 3708
+ | wrong-mimetype | 2158
+ | blocked-cookie | 1942
+ | spn2-error:blocked-url | 1733
+ | wayback-error | 1063
+ | spn2-error | 647
+ | spn2-error:500 | 265
+ | cdx-error | 257
+(383 rows)
+
+----
+
+365k results in 7 days is about 52k per day, which is about the expected rate.
+Around 5-7% of these need retries.
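+
+For the retries, a sketch of how candidates for a given domain and status could
+be pulled back out (not actually run; assumes `ingest_file_result.terminal_url`
+holds the final URL), using arxiv.org `terminal-bad-status` as an example:
+
+    SELECT ingest_request.ingest_type, ingest_request.base_url
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_request.base_url = ingest_file_result.base_url
+        AND ingest_request.ingest_type = ingest_file_result.ingest_type
+    -- example selection; domain and status would vary per the notes below
+    WHERE ingest_file_result.status = 'terminal-bad-status'
+        AND ingest_file_result.terminal_url LIKE 'http%://arxiv.org/%'
+    LIMIT 100;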
+
+important changes:
+- biorxiv.org: needs fix and then retries
+- academic.oup.com: should probably skip
+- apps.crossref.org: need to handle this in code
+- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/`
+- doi.org: investigate redirect-loop and terminal-bad-status
+- osf.io: not getting PDFs
+- papers.ssrn.com: why are these attempted?
+- publons.com: not getting PDFs; special case these?
+- www.sciencedirect.com: not working at all?
+
+smaller:
+- bridges.monash.edu: fix, then retry?
+- dl.acm.org: some broader retries?
+- figshare.com: still some attempts, but almost all no-pdf-link
+- onlinelibrary.wiley.com: getting blocked broadly?
+- www.endocrine-abstracts.org: HTML content?
+- www.igi-global.com: no-pdf-link
diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt
new file mode 100644
index 0000000..be2b30c
--- /dev/null
+++ b/sql/stats/2022-09-06_stats.txt
@@ -0,0 +1,438 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 62 GB | 44 GB | 106 GB
+ "public"."ingest_request" | 51 GB | 50 GB | 101 GB
+ "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB
+ "public"."file_meta" | 39 GB | 39 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7469 MB | 66 MB | 7535 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 198175106 | 282695671015403
+ (1 row)
+
+ 198 million files, 282 TBytes.
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 197021437
+ text/html | 830331
+ application/octet-stream | 186669
+ application/xml | 42170
+ application/xhtml+xml | 38207
+ text/plain | 16471
+ application/jats+xml | 10385
+ application/gzip | 6681
+ | 6032
+ application/postscript | 4916
+ image/jpeg | 4522
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 946
+ application/x-bzip2 | 891
+ image/png | 659
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382
+ application/x-compress | 274
+ video/mp4 | 218
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12800
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 137283420 | 172140506
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 157465613
+ warc/revisit | 11337336
+ text/html | 1137208
+ application/octet-stream | 950380
+ text/xml | 528965
+ unk | 253294
+ application/postscript | 81130
+ application/save | 81069
+ binary/octet-stream | 68942
+ application/x-download | 42717
+ application/download | 40628
+ image/pdf | 39904
+ text/plain | 36445
+ application/force-download | 24148
+ multipart/form-data | 10972
+ application | 5409
+ application/x-octetstream | 5192
+ application/x-msdownload | 3854
+ .pdf | 3518
+ application/x-pdf | 3061
+ application/octet | 1792
+ pdf | 1757
+ application/binary | 1399
+ file | 1373
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1087
+ application/unknown | 1005
+ 0 | 773
+ text/pdf | 729
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 129001717
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 120797098
+ 500 | 8198783
+ -4 | 5802
+ 503 | 36
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 60469462
+ 0.5.5-fatcat | 47472904
+ | 12665498
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | doi | 43852308
+ pdf | mag | 43701948
+ pdf | doaj | 6534341
+ html | doaj | 3987669
+ pdf | arxiv | 2784589
+ pdf | pmc | 2439181
+ pdf | dblp | 631716
+ html | doi | 126699
+ xml | doaj | 23066
+ pdf | cnki_covid19 | 2034
+ pdf | spn | 1026
+ pdf | wanfang_covid19 | 975
+ html | spn | 65
+ xml | spn | 2
+ xml | doi | 1
+ (17 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 24742500
+ pdf | doi | fatcat-ingest | 15592121
+ pdf | doaj | doaj | 6484737
+ html | doaj | doaj | 3987468
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 799793
+ pdf | dblp | dblp | 631716
+ pdf | pmc | fatcat-ingest | 297980
+ html | doi | fatcat-ingest | 121508
+ pdf | pmc | fatcat-changelog | 112376
+ pdf | doaj | fatcat-changelog | 47181
+ xml | doaj | doaj | 23066
+ html | doi | fatcat-changelog | 5129
+ pdf | doaj | fatcat-ingest | 2423
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1814
+ pdf | spn | savepapernow-web | 1026
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ html | doaj | fatcat-ingest | 201
+ html | spn | savepapernow-web | 65
+ html | doi | savepapernow-web | 62
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 4
+ xml | spn | savepapernow-web | 2
+ xml | doi | savepapernow-web | 1
+ (30 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 167653
+ pdf | doaj | 81517
+ pdf | oai | 15282
+ html | doaj | 1791
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (6 rows)
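+
+Regarding the TODO above: one way to cross-check the LEFT JOIN formulation (a
+sketch, not run against this snapshot) is an equivalent NOT EXISTS query, which
+should return the same counts:
+
+    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+    FROM ingest_request
+    WHERE NOT EXISTS (
+        SELECT 1 FROM ingest_file_result
+        WHERE ingest_file_result.base_url = ingest_request.base_url
+          AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    )
+    GROUP BY ingest_request.ingest_type, ingest_request.link_source
+    ORDER BY COUNT(*) DESC LIMIT 35;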
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 16024068 | 0.313
+ pdf | unpaywall | 43932525 | 36045446 | 0.820
+ pdf | doi | 43852308 | 14956080 | 0.341
+ pdf | mag | 43701948 | 32768484 | 0.750
+ pdf | doaj | 6534341 | 4704066 | 0.720
+ html | doaj | 3987669 | 778165 | 0.195
+ pdf | arxiv | 2784589 | 2419941 | 0.869
+ pdf | pmc | 2439181 | 1897671 | 0.778
+ pdf | dblp | 631716 | 305142 | 0.483
+ html | doi | 126699 | 75754 | 0.598
+ xml | doaj | 23066 | 10381 | 0.450
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | spn | 1026 | 778 | 0.758
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ html | spn | 65 | 13 | 0.200
+ xml | spn | 2 | 1 | 0.500
+ xml | doi | 1 | 0 | 0.000
+ (17 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------+----------
+ pdf | success | 94887295
+ pdf | no-pdf-link | 33960080
+ pdf | no-capture | 20893916
+ pdf | terminal-bad-status | 6973765
+ pdf | redirect-loop | 5775175
+ pdf | link-loop | 4095424
+ pdf | skip-url-blocklist | 4037518
+ pdf | blocked-cookie | 3508762
+ html | wrong-scope | 1783694
+ pdf | wrong-mimetype | 1379673
+ html | success | 853762
+ pdf | gateway-timeout | 635170
+ html | no-capture | 381283
+ pdf | wayback-content-error | 356694
+ pdf | cdx-error | 347700
+ pdf | null-body | 336166
+ html | unknown-scope | 321874
+ html | html-resource-no-capture | 294294
+ pdf | forbidden | 291127
+ pdf | not-found | 274343
+ pdf | too-many-redirects | 264494
+ component | wrong-mimetype | 196680
+ component | spn2-cdx-lookup-failure | 173615
+ component | spn2-backoff | 115840
+ html | terminal-bad-status | 106264
+ html | null-body | 100296
+ pdf | wayback-error | 94748
+ html | blocked-cookie | 88537
+ component | no-capture | 75278
+ pdf | empty-blob | 61157
+ pdf | bad-redirect | 58680
+ pdf | skip-wall | 57751
+ pdf | spn2-error:too-many-redirects | 52873
+ html | spn2-backoff | 50577
+ pdf | remote-server-error | 41282
+ pdf | invalid-host-resolution | 38864
+ pdf | read-timeout | 37071
+ pdf | spn2-cdx-lookup-failure | 34229
+ html | wrong-mimetype | 33643
+ pdf | spn2-backoff | 32437
+ pdf | petabox-error | 31006
+ html | wayback-content-error | 28034
+ component | spn2-error | 27044
+ pdf | spn2-error:unknown | 25810
+ component | gateway-timeout | 25215
+ pdf | body-too-large | 21721
+ html | petabox-error | 18313
+ html | empty-blob | 14393
+ html | redirect-loop | 13404
+ component | blocked-cookie | 12287
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 45052391
+ pdf | | 26117481
+ pdf | 301 | 4814786
+ html | 200 | 2684821
+ pdf | 403 | 1871088
+ pdf | 404 | 1254259
+ pdf | 302 | 898728
+ pdf | 503 | 867548
+ pdf | 401 | 851205
+ pdf | 429 | 741869
+ pdf | 400 | 624519
+ component | | 456915
+ html | | 442051
+ pdf | 500 | 283700
+ component | 200 | 197510
+ pdf | 410 | 120647
+ pdf | 303 | 107947
+ html | 404 | 80114
+ pdf | 420 | 26722
+ pdf | 502 | 19500
+ pdf | 409 | 15499
+ html | 429 | 15208
+ pdf | 509 | 15167
+ pdf | 999 | 12186
+ pdf | 202 | 11535
+ html | 301 | 10213
+ xml | | 10018
+ pdf | 307 | 8657
+ pdf | 402 | 8338
+ pdf | 412 | 8064
+ pdf | 308 | 6479
+ html | 500 | 4746
+ xml | 200 | 2668
+ pdf | 520 | 2496
+ html | 302 | 2289
+ pdf | 521 | 2257
+ html | 202 | 2177
+ pdf | 206 | 1961
+ html | 403 | 1775
+ pdf | 504 | 1187
+ pdf | 421 | 1148
+ html | 303 | 1112
+ pdf | 406 | 1109
+ pdf | 204 | 772
+ pdf | 432 | 745
+ pdf | 405 | 633
+ html | 400 | 632
+ pdf | 426 | 515
+ pdf | 508 | 503
+ pdf | 505 | 469
+ (50 rows)
diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt
new file mode 100644
index 0000000..0a6254a
--- /dev/null
+++ b/sql/stats/2022-11-23_table_sizes.txt
@@ -0,0 +1,21 @@
+PostgreSQL 13.2 - wbgrp-svc506.us.archive.org
+Size: 1.13T
+
+ table_name | table_size | indexes_size | total_size
+------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 63 GB | 45 GB | 108 GB
+ "public"."ingest_request" | 53 GB | 52 GB | 105 GB
+ "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB
+ "public"."file_meta" | 39 GB | 40 GB | 79 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7879 MB | 68 MB | 7947 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
diff --git a/sql/stats/README.md b/sql/stats/README.md
index 62e213c..3161514 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -49,7 +49,7 @@ mimetype counts:
Counts:
- SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+ SELECT COUNT(*) AS total_files FROM grobid;
Status?
@@ -107,14 +107,3 @@ Failed ingest by terminal status code:
SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
-## Fatcat Files
-
-Count of PDF files that GROBID processed and matched to a release (via
-glutton), but no PDF in `fatcat_file`:
-
- SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
- FROM grobid
- LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
- WHERE fatcat_file.sha1hex IS NULL
- AND grobid.fatcat_release IS NOT NULL;
-